博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
day_3:解析
阅读量:4993 次
发布时间:2019-06-12

本文共 3722 字,大约阅读时间需要 12 分钟。

1、XPath:from lxml import etree

选取节点(所有节点:*)

属性匹配

html.xpath('节点名称[@属性名称="属性"]')

html.xpath('节点名称[contains(@属性名称, "属性")]')  多属性匹配选一匹配

   如:<p class="a b">....</p>

    html.xpath('//p[contains(@class, "a")]')

html.xpath('节点名称[contains(@属性名称, "属性") and @属性名称="属性"]')  多属性匹配两个都匹配

   如:<p class="a" name="b">....</p>

    html.xpath('//p[contains(@class, "a") and @name="b"]')

文本获取:/text()

html.xpath('节点名称[@属性名称="属性"]/text()')

属性获取:@属性名称

html.xpath('节点名称/@属性名称')

2、Beautiful Soup:form bs4 import BeautifulSoup

3、PyQuery:from pyquery import PyQuery as pq

属性获取:.attr(属性名称)或者.attr.属性名称

文本获取:.text()

# Scrape the movie board at BOARD_URL (10 pages, offsets 0-90) with
# requests + lxml XPath and print one dict per movie.
import re

import requests
from lxml import etree

BOARD_URL = 'http://maoyan.com/board/4'

HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.102 Safari/537.36'
    ),
}

# Raw strings so "\s" / "\d" reach the regex engine instead of being parsed
# as (invalid) string escapes. Compiled once because they run per movie.
# Both the ASCII and the full-width colon are accepted after the label.
# NOTE(review): which colon the live page uses is not visible here - confirm.
_STAR_RE = re.compile(r'\s*主演[::]\s*(.*?)\s*$', re.S)
_DATE_RE = re.compile(r'\d{4}(?:-\d{2}-\d{2})?')


def get_html(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The decoded body on success, or ``None`` when the request fails
        (connection error, timeout, or a non-2xx status). The failure is
        printed rather than raised so the caller can skip the page.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Only network/HTTP failures are handled; anything else (including
        # KeyboardInterrupt) propagates.
        print('request failed for {}: {}'.format(url, exc))
        return None
    return response.text


def parse_str(str):
    """Return the cast list from a "star" text node, without its label.

    The parameter keeps its original name (it shadows the builtin inside
    this function only) so existing keyword callers keep working.

    Args:
        str: Raw node text, e.g. ``"\\n   主演:A,B,C\\n  "``.

    Returns:
        The text after the "主演" label with surrounding whitespace removed.
        If the label is absent, the stripped input is returned instead of
        raising ``AttributeError`` on a failed match.
    """
    text = str or ''
    found = _STAR_RE.match(text)
    if found is None:
        return text.strip()
    return found.group(1)


def parse_time(str):
    """Extract a ``YYYY`` or ``YYYY-MM-DD`` date from *str*.

    Args:
        str: Raw release-time text (parameter name kept for compatibility).

    Returns:
        The first date-like substring, or ``''`` when none is present.
    """
    found = _DATE_RE.search(str or '')
    return found.group() if found else ''


def parse_html(html, info_list):
    """Parse one board page and append a dict per movie to *info_list*.

    Args:
        html: Page markup as returned by :func:`get_html`.
        info_list: List that receives dicts with the keys ``rank``,
            ``name``, ``actor``, ``time`` and ``score``. Mutated in place.
    """
    root = etree.HTML(html)
    names = root.xpath('//p[@class="name"]/a/text()')
    ranks = root.xpath('//i[contains(@class, "board-index")]/text()')
    stars = [parse_str(s) for s in root.xpath('//p[@class="star"]/text()')]
    times = [
        parse_time(t) for t in root.xpath('//p[@class="releasetime"]/text()')
    ]
    integers = root.xpath('//i[@class="integer"]/text()')
    fractions = root.xpath('//i[@class="fraction"]/text()')

    # NOTE: the six lists are paired up purely by position and zip() stops
    # at the shortest one, so this relies on every movie contributing
    # exactly one node to each query.
    rows = zip(ranks, names, stars, times, integers, fractions)
    for rank, name, actor, ts, integer, fraction in rows:
        info_list.append({
            'rank': rank,
            'name': name,
            'actor': actor,
            'time': ts,
            # The score is rendered as two nodes: integer part + fraction.
            'score': integer + fraction,
        })


def main():
    """Download all ten board pages and print every collected record."""
    info_list = []
    for page in range(10):
        path = '{}?offset={}'.format(BOARD_URL, page * 10)
        print(path)
        html = get_html(path)
        if html:
            parse_html(html, info_list)
    for info in info_list:
        print(info)


if __name__ == '__main__':
    main()
# Scrape the movie board at BOARD_URL (10 pages, offsets 0-90) with
# requests + PyQuery CSS selectors and print one dict per movie.
import re

import requests
from pyquery import PyQuery

BOARD_URL = 'http://maoyan.com/board/4'

HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.102 Safari/537.36'
    ),
}

# Raw strings so "\d" reaches the regex engine instead of being parsed as an
# (invalid) string escape. Compiled once because they run per movie.
_DATE_RE = re.compile(r'\d{4}(?:-\d{2}-\d{2})?')
# Label in front of the cast list; both the ASCII and the full-width colon
# are accepted.
# NOTE(review): which colon the live page uses is not visible here - confirm.
_ACTOR_LABEL_RE = re.compile(r'主演[::]')


def get_html(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The decoded body on success, or ``None`` when the request fails
        (connection error, timeout, or a non-2xx status). The failure is
        printed rather than raised so the caller can skip the page.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Only network/HTTP failures are handled; anything else (including
        # KeyboardInterrupt) propagates.
        print('request failed for {}: {}'.format(url, exc))
        return None
    return response.text


def parse_time(str):
    """Extract a ``YYYY`` or ``YYYY-MM-DD`` date from *str*.

    The parameter keeps its original name (it shadows the builtin inside
    this function only) so existing keyword callers keep working.

    Args:
        str: Raw release-time text.

    Returns:
        The first date-like substring, or ``''`` when none is present
        (instead of raising ``AttributeError`` on a failed search).
    """
    found = _DATE_RE.search(str or '')
    return found.group() if found else ''


def parse_html(html, info_list):
    """Parse one board page and append a dict per movie to *info_list*.

    Args:
        html: Page markup as returned by :func:`get_html`.
        info_list: List that receives dicts with the keys ``rank``,
            ``name``, ``actor``, ``time`` and ``score``. Mutated in place.
    """
    board = PyQuery(html)('dl.board-wrapper')

    # One iterator of matched elements per field, in output order.
    # NOTE: the iterators are paired up purely by position and zip() stops
    # at the shortest one, so this relies on every movie contributing
    # exactly one element to each selector.
    columns = [
        board(selector).items()
        for selector in (
            '.board-index', '.name', '.star',
            '.releasetime', '.integer', '.fraction',
        )
    ]
    for rank, name, actor, ts, integer, fraction in zip(*columns):
        info_list.append({
            'rank': rank.text(),
            'name': name.text(),
            'actor': _ACTOR_LABEL_RE.sub('', actor.text()),
            'time': parse_time(ts.text()),
            # The score is rendered as two elements: integer + fraction.
            'score': integer.text() + fraction.text(),
        })


def main():
    """Download all ten board pages and print every collected record."""
    info_list = []
    for page in range(10):
        path = '{}?offset={}'.format(BOARD_URL, page * 10)
        print(path)
        html = get_html(path)
        if html:
            parse_html(html, info_list)
    for info in info_list:
        print(info)


if __name__ == '__main__':
    main()

 

转载于:https://www.cnblogs.com/jp-mao/p/10009558.html

你可能感兴趣的文章
再说virtual
查看>>
随笔:技术流可以这样写博客
查看>>
[优化]JavaScript 格式化带有占位符字符串
查看>>
打JAR包
查看>>
大图轮播
查看>>
UNIX环境高级编程读书笔记
查看>>
java awt 乱码问题
查看>>
矩阵中的路径
查看>>
unity回调函数范例
查看>>
linux下给php安装curl、gd(ubuntu)
查看>>
Java自带的Logger使用-代码摘要
查看>>
Java设计模式系列 — 构造器模式
查看>>
MySQL执行计划explain的key_len解析
查看>>
Windows Phone开发(9):关于页面状态 转:http://blog.csdn.net/tcjiaan/article/details/7292160...
查看>>
android 通过数组,流播放声音的方法
查看>>
Spring入门篇
查看>>
JAVA遇见HTML——JSP篇(JSP状态管理)
查看>>
启动eclipse出现错误Java was started but returned exit =一个数字
查看>>
myBatis模糊查找
查看>>
数据结构与算法之五 链接列表
查看>>