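A small Sohu NBA news spider: it requests the list page, follows each article link, extracts the title and body (first with regular expressions, then with XPath), and writes the rows to MySQL.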
```python
import re
import ssl
import urllib.request

import pymysql
from lxml import etree
```
```python
def decode_html(html, charsets=('UTF-8', 'GBK')):
    """Decode the raw page bytes, trying each charset in turn."""
    page_html = ''
    for charset in charsets:
        try:
            page_html = html.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html
```
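A quick sketch of the fallback behaviour, using a made-up byte string: text encoded as GBK fails the UTF-8 attempt, so the second charset in the tuple picks it up.

```python
# Illustrative only: these bytes are not valid UTF-8, so the first
# decode attempt raises UnicodeDecodeError and the GBK attempt succeeds.
raw = '你好'.encode('GBK')
print(decode_html(raw))  # -> 你好
```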
```python
def get_html_response(url):
    """Fetch the page and return it as decoded text."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    }
    # Skip certificate verification so HTTPS pages with bad certs still load
    context = ssl._create_unverified_context()
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request, context=context)
    return decode_html(response.read())


def pattern_regex(html, pattern, flags=re.S):
    """Run a regular expression against the page.

    :param html: the fetched page text
    :param pattern: the regular expression
    :param flags: regex flags (re.S lets '.' match newlines)
    :return: all matches, as a list
    """
    pattern = re.compile(pattern, flags)
    return re.findall(pattern, html)
```
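A minimal self-check of pattern_regex against the kind of anchor tag the list-page pattern below expects; the snippet is fabricated for illustration, and the real Sohu markup may differ:

```python
snippet = "<a test=a href='http://sports.sohu.com/a/123' target='_blank'>headline</a>"
print(pattern_regex(snippet, "<a test=a href='(.*?)' target='_blank'>"))
# -> ['http://sports.sohu.com/a/123']
```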
```python
def get_mysql(sql, params_list):
    """Insert the scraped rows into the database.

    :param sql: the INSERT statement
    :param params_list: a list of parameter tuples, one per row
    """
    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           password='root', database='spider', charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.executemany(sql, params_list)
        conn.commit()
    finally:
        conn.close()
```
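get_mysql assumes a spider database with a sohu table already exists; the post does not show the schema, so the setup below is an assumption that merely satisfies the INSERT statement used later:

```python
# Hypothetical one-off setup; the column types are an assumption,
# not taken from the original post.
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='root', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute('create database if not exists spider default charset utf8')
        cursor.execute('use spider')
        cursor.execute('create table if not exists sohu ('
                       'id int primary key auto_increment, '
                       'title varchar(255), content text)')
    conn.commit()
finally:
    conn.close()
```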
```python
def start_crawl(url):
    """Scrape the site with regular expressions.

    :param url: the URL of the list page
    """
    html = get_html_response(url)
    # Pull the article links off the list page
    link_list = pattern_regex(html, "<a test=a href='(.*?)' target='_blank'>")
    params_list = []
    for link_url in link_list:
        html = get_html_response(link_url)
        title = pattern_regex(html, '<h1>(.*?)<span.*?</h1>')
        content = pattern_regex(html, '<article class="article" id="mp-editor">(.*?)</article>')
        # Guard both lists so a page that matches neither pattern is skipped
        if title and content:
            params_list.append([title[0], content[0]])
    sql = 'insert into sohu(title, content) values (%s, %s)'
    get_mysql(sql, params_list)
```
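The article-page patterns rely on re.S so that '.' also matches newlines; a small check of the title pattern against an invented fragment:

```python
fragment = '<h1>Some title<span class="time">2018-04-20</span></h1>'
print(pattern_regex(fragment, '<h1>(.*?)<span.*?</h1>'))  # -> ['Some title']
```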
```python
def start_crawl_xpath(url):
    """Scrape the site with XPath.

    :param url: the URL of the list page
    """
    html = etree.HTML(get_html_response(url))
    li_list = html.xpath('/html/body/div[1]/div[4]/div[1]/div[1]/ul/li')
    params_list = []
    for li in li_list:
        a_link = li.xpath('./a/@href')
        if a_link:
            content_html = etree.HTML(get_html_response(a_link[0]))
            title = content_html.xpath('//*[@id="article-container"]/div[2]/div[1]/div[1]/h1/text()')
            # The body is split across <p> elements, so join the pieces
            content = content_html.xpath('//*[@id="mp-editor"]/p/text()')
            if title:
                params_list.append([title[0], ''.join(content)])
    sql = 'insert into sohu(title, content) values (%s, %s)'
    get_mysql(sql, params_list)
```
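The same extraction on a self-contained fragment shows why the <p> texts are joined: lxml returns each paragraph as a separate list item (the markup here is invented for the demo):

```python
doc = etree.HTML('<div id="mp-editor"><p>First paragraph.</p>'
                 '<p>Second paragraph.</p></div>')
print(''.join(doc.xpath('//*[@id="mp-editor"]/p/text()')))
# -> First paragraph.Second paragraph.
```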
```python
if __name__ == '__main__':
    url = 'http://sports.sohu.com/nba_a.shtml'
    # start_crawl(url) is the regex-based alternative
    start_crawl_xpath(url)
```