Python爬虫之抓取豆瓣电影TOP250 | 奇点产品

/ 0评 / 1

环境配置

运行平台:MAC

Python版本:Python 3.6

IDE:pycharm

浏览器:Chrome浏览器

思路流程

1.查看网页源代码

2.抓取页面内容

3.豆瓣TOP250所有信息写入文件

 

1,查看网页源代码

输入网址:https://movie.douban.com/top250?start=25&filter=

可以看到界面如下

我们发现每一页链接中的 start= 参数依次增加 25(第一页 start=0,第二页 start=25,以此类推),这样就可以确定链接格式了。

按F12查看网页源代码,查看标签信息,发现每一个电影的信息都在“<li></li>”标签之中。

例如这里title,star,score等。后边提取页面信息会用到。

2,抓取网页信息

需要使用的python库(这里我们分别用三个方式尝试)

requests
re # 正则表达式
json
BeautifulSoup # BS
lxml # xpath

 

抓取代码如下:

1.1获取html

def get_one_page(url):
    """Fetch *url* and return its HTML text, or None on any non-200 status."""
    response = requests.get(url)
    if response.status_code != 200:
        return None
    return response.text

 

1.2 正则获取

def zhengze_parse(html):
    """Parse one TOP250 page with a regular expression.

    Yields one dict per movie with keys: index, image, title,
    people (review count, trailing two chars stripped), score, Evaluation.
    """
    movie_re = re.compile(
        '<em class="">(.*?)</em>.*?<img.*?alt="(.*?)".*?src="(.*?)"'
        '.*?property="v:average">(.*?)</span>.*?<span>(.*?)</span>.*?'
        'class="inq">(.*?)</span>',
        re.S,
    )
    for match in movie_re.finditer(html):
        rank, name, poster, rating, raters, quote = match.groups()
        yield {
            'index': rank,
            'image': poster,
            'title': name,
            'people': raters.strip()[:-2],
            'score': rating,
            'Evaluation': quote,
        }

 

1.3 BS获取

def soup_parse(html):
    """Parse one TOP250 page with BeautifulSoup, yielding one dict per movie."""
    page = BeautifulSoup(html, 'lxml')
    for item in page.find_all('div', class_='item'):
        # Movie #125 has no one-line description; fall back to an empty string.
        quote_tag = item.find('span', class_='inq')
        yield {
            'index': item.em.text,
            'image': item.img['src'],
            'title': item.img['alt'],
            'people': item.find_all('span')[-2].text[:-2],
            'score': item.find('span', class_='rating_num').text,
            'Evaluation': quote_tag.text if quote_tag else '',
        }

 

1.4 XPATH获取

def xpath_parse(html):
    """Parse one TOP250 page via lxml XPath, yielding one dict per movie."""
    tree = etree.HTML(html)
    for node in tree.xpath('//ol[@class="grid_view"]/li'):
        # Movie #125 has no description; substitute an empty string.
        quote = node.xpath('.//p[@class="quote"]/span/text()')
        yield {
            'index': node.xpath('.//em/text()')[0],
            'image': node.xpath('.//a/img/@src')[0],
            'title': node.xpath('.//a/img/@alt')[0],
            'people': node.xpath('.//div[@class="star"]/span[4]/text()')[0][:-2],
            'score': node.xpath('.//div[@class="star"]/span[2]/text()')[0],
            'Evaluation': quote[0] if quote else '',
        }

 

1.5 写入文件

def write_to_file(content, flag):
    """Append *content* as one JSON line to '豆瓣电影TOP250(<flag>).txt'.

    BUG FIX: the original f.write(...) line had no indentation, which is an
    IndentationError — it must sit inside the `with` block.
    ensure_ascii=False keeps Chinese characters readable in the output file.
    """
    with open('豆瓣电影TOP250(' + str(flag) + ').txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

 

3,完整代码

import requests 
import re 
# 正则表达式 
import json 
from bs4 import BeautifulSoup  
from lxml import etree # xpath 

def get_one_page(url):
    """Download *url* and return the HTML body, or None for a non-200 response."""
    resp = requests.get(url)
    return resp.text if resp.status_code == 200 else None
        
def zhengze_parse(html):
    """Regex-based parser: yield one movie dict per entry on a TOP250 page."""
    pattern = re.compile(
        '<em class="">(.*?)</em>.*?<img.*?alt="(.*?)".*?src="(.*?)"'
        '.*?property="v:average">(.*?)</span>.*?<span>(.*?)</span>.*?'
        'class="inq">(.*?)</span>',
        re.S,
    )
    for idx, title, img, score, people, quote in pattern.findall(html):
        yield {
            'index': idx,
            'image': img,
            'title': title,
            'people': people.strip()[:-2],
            'score': score,
            'Evaluation': quote,
        }
        
def soup_parse(html):
    """Parse one TOP250 page with BeautifulSoup, yielding one dict per movie.

    BUG FIX: in the original, `yield` was indented inside the `else` branch,
    so only movies WITHOUT a description were emitted and everything else was
    silently dropped. The yield must run for every item (compare the snippet
    version of this function earlier in the article, which is correct).
    """
    soup = BeautifulSoup(html, 'lxml')
    for data in soup.find_all('div', class_='item'):
        index = data.em.text
        image = data.img['src']
        title = data.img['alt']
        people = data.find_all('span')[-2].text[:-2]
        score = data.find('span', class_='rating_num').text
        # Movie #125 has no one-line description; use an empty string instead.
        if data.find('span', class_='inq'):
            Evaluation = data.find('span', class_='inq').text
        else:
            Evaluation = ''
        yield { 'index': index, 'image': image, 'title': title, 'people': people, 'score': score, 'Evaluation': Evaluation, }
            
def xpath_parse(html):
    """Parse one TOP250 page via lxml XPath, yielding one dict per movie.

    BUG FIX: in the original, `yield` was indented inside the `else` branch,
    so only movies WITHOUT a description were emitted. The yield must run for
    every <li> (compare the snippet version earlier in the article).
    """
    html = etree.HTML(html)
    for data in html.xpath('//ol[@class="grid_view"]/li'):
        index = data.xpath('.//em/text()')[0]
        image = data.xpath('.//a/img/@src')[0]
        title = data.xpath('.//a/img/@alt')[0]
        people = data.xpath('.//div[@class="star"]/span[4]/text()')[0][:-2]
        score = data.xpath('.//div[@class="star"]/span[2]/text()')[0]
        # Movie #125 has no description; use an empty string instead.
        if data.xpath('.//p[@class="quote"]/span/text()'):
            Evaluation = data.xpath('.//p[@class="quote"]/span/text()')[0]
        else:
            Evaluation = ''
        yield { 'index': index, 'image': image, 'title': title, 'people': people, 'score': score, 'Evaluation': Evaluation, }
            
def write_to_file(content, flag):
    """Append *content* as a JSON line to '豆瓣电影TOP250(<flag>).txt'.

    ensure_ascii=False keeps Chinese text readable in the output file.
    """
    out_name = '豆瓣电影TOP250(' + str(flag) + ').txt'
    serialized = json.dumps(content, ensure_ascii=False)
    with open(out_name, 'a', encoding='utf-8') as f:
        f.write(serialized + '\n')
        
def search(Num):
    """Scrape one TOP250 page (offset *Num*) with all three parsers.

    Each parser's results go to its own output file so the three
    extraction methods can be compared side by side.
    """
    url = 'https://movie.douban.com/top250?start=' + str(Num)
    html = get_one_page(url)
    for item in zhengze_parse(html):
        write_to_file(item, '正则表达式')
    for item in soup_parse(html):
        write_to_file(item, 'BS4')
    for item in xpath_parse(html):
        write_to_file(item, 'xpath')
    # Integer division replaces the fragile str(Num / 25 + 1)[:-2] trick
    # (slicing "2.0" down to "2"); the printed text is identical.
    page = str(Num // 25 + 1)
    print("正在爬取第" + page + '页')

def main():
    """Scrape all 10 pages (start = 0, 25, ..., 225) of the Douban TOP250 list.

    BUG FIX: in the original, search(Num) sat AFTER the loop, so only the
    last page (start=225) was actually scraped; the loop body merely rebound
    Num. The call belongs inside the loop.
    """
    for i in range(0, 10):
        search(i * 25)
    print("爬取完成")
    
if __name__ == '__main__': 
    # Entry point: run the full scrape when executed as a script.
    main()