PYTHON 七月 31, 2019

8.用Scrapy框架进行爬取的案例

文章字数 27k 阅读约需 25 mins. 阅读次数 1000000


只有一个页面,无需跳转的爬取


案例文件下载

创建项目: scrapy startproject 项目名

scrapy startproject myFirstScrapy

创建爬虫: scrapy genspider 爬虫名 域名

scrapy genspider intersting u148.cn

编写items.py文件,对接需求

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

#item本质是一个字典对象,用于对页面上的待抓取的那些元素进行模型化,主要对接需求分析
class MyfirstscrapyItem(scrapy.Item):
    """Model of one entry scraped from the listing page.

    A Scrapy item behaves like a dict; each ``scrapy.Field()`` declares one
    key the spider is allowed to fill in.
    """
    # Text of the entry's headline link.
    title = scrapy.Field()
    # URL of the thumbnail image.
    img_url = scrapy.Field()
    # Excerpt text of the entry.
    info = scrapy.Field()
    # href of the headline link (address of the entry's own page).
    next_page = scrapy.Field()

编写爬虫(spiders目录下的爬虫文件intersting.py),并且进行解析,连接数据库时需要先创建数据库以及对应的表

# -*- coding: utf-8 -*-
import scrapy
# 导入item
from myFirstScrapy.items import MyfirstscrapyItem


class InterstingSpider(scrapy.Spider):
    """Basic single-page spider for the u148.cn music listing.

    It downloads ``start_urls``, parses every matching ``<article>`` element
    into a ``MyfirstscrapyItem`` and returns the items to the framework.
    """

    # Unique name of the spider; `scrapy crawl intersting` selects it by this.
    name = 'intersting'

    # Before downloading, target URLs are checked against these domains.
    allowed_domains = ['u148.cn']

    # Seed URLs: the engine takes its first requests from here.
    start_urls = ['http://www.u148.cn/music/']

    def parse(self, response):
        """Parse the downloaded listing page.

        Called back by the framework once a start URL has been downloaded.

        Args:
            response: the downloaded page.

        Returns:
            list: one ``MyfirstscrapyItem`` per matched ``<article>``.

        Raises:
            IndexError: if one of the per-field XPath queries matches nothing.
        """
        # xpath() returns a list of Selector objects, one per matching tag;
        # extract() turns a selector list into the matched strings.
        music_list = response.xpath("//article[starts-with(@class,'ajaxpost box')]")
        print(music_list)

        # Collects every parsed entry.
        musics = []
        for music in music_list:
            item = MyfirstscrapyItem()
            # NOTE(review): `music` is already an <article>; "./article/..."
            # only matches a nested <article> child - confirm against the page.
            item["title"] = music.xpath("./article/h2/a/text()").extract()[0]
            item["img_url"] = music.xpath(".//img[@class='thumb']/@data-original").extract()[0]
            item["info"] = music.xpath(".//div[@class='excerpt']/text()").extract()[0]
            item["next_page"] = music.xpath("./article/h2/a/@href").extract()[0]
            musics.append(item)

        # The callback must return an iterable. Scrapy consumes it and can
        # write the items to the log, to a feed file, or pass them one by one
        # to the item pipelines.
        return musics

更改settings.py文件,根据需要进行修改配置


BOT_NAME = 'myFirstScrapy'

SPIDER_MODULES = ['myFirstScrapy.spiders']
NEWSPIDER_MODULE = 'myFirstScrapy.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'myFirstScrapy (+http://www.yourdomain.com)'
# A desktop Chrome user-agent string is sent instead of the template default.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.204 Safari/537.36'

# robots.txt handling: set to False here, so robots.txt rules are NOT obeyed.
ROBOTSTXT_OBEY = False


# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Item pipelines are disabled by default. Once this setting is enabled, the
# engine calls the pipeline classes and the iterable returned by the spider
# is passed on to them.
# Each key/value pair is one pipeline: the key is the import path of the
# pipeline class, the value is its order. With several pipelines, the one
# with the smaller number receives the item first (larger number = lower
# priority).
ITEM_PIPELINES = {
   'myFirstScrapy.pipelines.MyfirstscrapyPipeline': 300,
# Enable the custom CSV pipeline; 310 > 300, so it runs after the pipeline above.
   'myFirstScrapy.pipelines.CSVPipelines': 310,
}

根据需求处理解析后的数据pipelines.py(管道文件)

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import csv
import pymysql

class MyfirstscrapyPipeline(object):
    """Item pipeline that inserts every scraped item into the MySQL table ``u148``.

    The database ``scrapy_test`` and the table must exist before the crawl.
    """

    # The values are sent as query parameters instead of being formatted into
    # the statement: scraped text containing quotes can then neither break
    # the statement nor inject SQL.
    # NOTE(review): the leading 0 is presumably an auto-increment id column -
    # confirm against the table definition.
    _INSERT_SQL = "insert into u148 values(0,%s,%s,%s,%s)"

    def __init__(self):
        pass

    def open_spider(self, spider):
        """Open the database connection; called when the spider is opened."""
        print("爬虫开始了")
        print(spider)
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='root',
            db='scrapy_test',
            charset='utf8'
        )
        # The cursor sends SQL statements to the MySQL server.
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item; called once per item handed over by the spider.

        Args:
            item: mapping with the keys ``title``, ``img_url``, ``info`` and
                ``next_page``.
            spider: the spider that produced the item.

        Returns:
            The unchanged item. Without this return value, pipelines with a
            lower priority would not receive the item.
        """
        print(item)
        print(spider)
        self.cur.execute(
            self._INSERT_SQL,
            (item['title'], item['img_url'], item['info'], item['next_page']),
        )
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and then the connection it belongs to."""
        print('爬虫被关闭了')
        # Close the cursor first: it depends on the still-open connection.
        self.cur.close()
        self.conn.close()

# 创建一个管道用于存储csv数据
class CSVPipelines(object):
    """Item pipeline that writes every item as one row of ``interst.csv``."""

    def open_spider(self, spider):
        """Create the CSV file and write its header row."""
        # newline="" is what the csv module requires for files handed to
        # csv.writer; without it, platforms that translate line endings
        # produce an extra blank line after every row.
        self.mycsv = open("interst.csv", 'w', encoding="utf-8", newline="")
        self.csvwriter = csv.writer(self.mycsv)
        self.csvwriter.writerow(["title", "img_url", "info", "next_page"])

    def process_item(self, item, spider):
        """Append one item to the file and pass it on unchanged.

        Rows are written as they arrive instead of being collected until the
        spider closes, so memory use stays flat and already scraped rows are
        not lost when the crawl aborts.

        Args:
            item: mapping with the keys ``title``, ``img_url``, ``info`` and
                ``next_page``.
            spider: the spider that produced the item.

        Returns:
            The unchanged item, so later pipelines still receive it.
        """
        self.csvwriter.writerow(
            [item["title"], item["img_url"], item["info"], item["next_page"]]
        )
        return item

    def close_spider(self, spider):
        """Flush and close the CSV file."""
        self.mycsv.close()

运行项目

命令格式:scrapy crawl 爬虫名 [-o 文件名.json/xml/csv] 
scrapy crawl intersting [-o intersting.csv/xml/json]

只有一个页面,但是有分页的爬取


案例文件下载


items.py

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class SchoolflowerItem(scrapy.Item):
    """Model of one person entry scraped from the listing page."""
    # Name
    name = scrapy.Field()
    # Image URL
    img = scrapy.Field()
    # Short introduction
    info = scrapy.Field()
    # URL of the detail page
    url = scrapy.Field()

spiders/Beauties.py

# -*- coding: utf-8 -*-
import scrapy
from SchoolFlower.items import SchoolflowerItem

class BeautiesSpider(scrapy.Spider):
    """Spider for a listing that is spread over several numbered pages.

    The first page is ``start_urls[0]``; the following pages are built by
    appending ``index_<n>.html`` to it, i.e. the crawl covers:

        http://news.daxues.cn/xiaohua/ziliao/
        http://news.daxues.cn/xiaohua/ziliao/index_2.html
        http://news.daxues.cn/xiaohua/ziliao/index_3.html
    """
    name = 'Beauties'
    allowed_domains = ['daxues.cn']
    start_urls = ['http://news.daxues.cn/xiaohua/ziliao/']

    # Number of the listing page requested most recently. It is kept on the
    # spider rather than re-initialised inside parse(): resetting it on every
    # callback would request page 2 again and again and never reach page 3.
    page = 1
    # Last listing page to crawl.
    max_page = 3

    def parse(self, response):
        """Yield one item per entry, then the request for the next page.

        Args:
            response: a downloaded listing page.

        Yields:
            ``SchoolflowerItem`` objects for the entries on this page,
            followed by a ``scrapy.Request`` for the next page while
            ``max_page`` has not been reached.

        Raises:
            IndexError: if one of the per-field XPath queries matches nothing.
        """
        girl_list = response.xpath("//div[@class='xh_list']/dl")
        for girl in girl_list:
            item = SchoolflowerItem()
            item["name"] = girl.xpath("./dt/a/text()").extract()[0]
            item["img"] = "http://news.daxues.cn" + girl.xpath("./a/img/@src").extract()[0]
            item["info"] = girl.xpath("./dd/text()").extract()[0]
            item["url"] = "http://news.daxues.cn" + girl.xpath("./dt/a/@href").extract()[0]

            yield item

        # Queue the next listing page and hand it to the downloader manually.
        if self.page < self.max_page:
            self.page += 1
            url = self.start_urls[0] + "index_" + str(self.page) + ".html"
            # parse() is reused as the callback because every listing page
            # has the same structure.
            yield scrapy.Request(url=url, callback=self.parse)

settings.py

# 1.打开管道文件
# 2.设置UA
# 3.robots.txt 规则
# 4.···

运行结果:


一级页面与二级页面的爬取


案例文件下载

一级页面:

二级页面:

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class MoviespiderItem(scrapy.Item):
    """Model of one movie, filled from the list page and its detail page."""
    # --- fields taken from the first-level (list) page ---
    # Title
    title = scrapy.Field()
    # Short introduction
    info = scrapy.Field()
    # Date
    date = scrapy.Field()
    # --- fields taken from the second-level (detail) page ---
    # Poster image
    img = scrapy.Field()
    # Plot / story text
    story = scrapy.Field()
    # Download link
    downloader = scrapy.Field()

spiders/movies.py

# -*- coding: utf-8 -*-
import scrapy
from MovieSpider.items import MoviespiderItem

class MovieSpider(scrapy.Spider):
    """Two-level spider: list page first, then each movie's detail page.

    ``parse`` fills the list-page fields of an item and forwards the
    half-finished item to ``parse_next`` through the request's ``meta``.
    """
    name = 'movie'
    allowed_domains = ['dytt8.net']
    start_urls = ['http://www.dytt8.net/html/gndy/dyzz/index.html']

    def parse(self, response):
        """Extract the list-page fields and request every detail page.

        Args:
            response: the downloaded list page.

        Yields:
            One ``scrapy.Request`` per movie table that contains a link.
        """
        # Every movie on the list page sits in its own <table>.
        movie_list = response.xpath("//div[@class='co_content8']//table")
        print(movie_list)
        for movie in movie_list:
            item = MoviespiderItem()
            # Fields available on the first-level page.
            item["title"] = movie.xpath(".//a/text()").extract_first()
            item['date'] = movie.xpath(".//font/text()").extract_first()
            item['info'] = movie.xpath(".//tr[last()]/td/text()").extract_first()

            href = movie.xpath(".//a/@href").extract_first()
            if href is None:
                # A table without a link has no detail page to follow;
                # concatenating None would raise TypeError and abort the
                # remaining tables of this page.
                continue
            next_url = "http://www.dytt8.net" + href
            # The remaining fields live on the detail page, so the downloader
            # is invoked again. `meta` travels with the request and is
            # available on the response as `response.meta`.
            yield scrapy.Request(url=next_url, callback=self.parse_next, meta={"movie_item": item})

    def parse_next(self, response):
        """Complete the item with the detail-page fields and emit it.

        Args:
            response: the downloaded detail page; ``response.meta`` carries
                the partly filled item under ``"movie_item"``.

        Yields:
            The completed ``MoviespiderItem``.
        """
        # Pick up the unfinished item created for the list page.
        item = response.meta["movie_item"]

        # Poster link
        item['img'] = response.xpath("//div[@id='Zoom']//img[1]/@src").extract_first()
        # Story: string(.) concatenates all text below the element.
        item["story"] = response.xpath("//div[@id='Zoom']").xpath("string(.)").extract_first()
        # Download link
        item["downloader"] = response.xpath("//td[@bgcolor='#fdfddf']/a/@href").extract_first()
        yield item

settings.py 同上

运行结果:


JS动态加载的页面数据的爬取


案例文件下载

每件商品下的原始代码

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
class VipspdierItem(scrapy.Item):
    """Model of one product scraped from the search result page."""
    # Product description
    info = scrapy.Field()
    # Original price
    orin_price = scrapy.Field()
    # Current price
    price = scrapy.Field()
    # Discount
    discount = scrapy.Field()
    # URL of the second-level (detail) page
    next_url = scrapy.Field()

spiders/vip.py

# -*- coding: utf-8 -*-
import scrapy
from VipSpdier.items import VipspdierItem

class VipSpider(scrapy.Spider):
    """Spider for the vip.com search result page for one keyword."""
    name = 'vip'
    allowed_domains = ['vip.com']
    start_urls = ['http://category.vip.com/suggest.php?keyword=%E8%BF%9E%E8%A1%A3%E8%A3%99']

    def parse(self, response):
        """Read every product block and request its detail page.

        Args:
            response: the downloaded search result page.

        Yields:
            One ``scrapy.Request`` per product block that contains a link.
        """
        goods_list = response.xpath("//div[starts-with(@id,'J_pro_')]")
        print(goods_list)
        for goods in goods_list:
            item = VipspdierItem()
            item["info"] = goods.xpath(".//a[@rel='noopener']/text()").extract_first()
            # The remaining fields are left as an exercise.

            # Detail page URL. The concatenation implies the href comes
            # without a scheme, so "http:" is prepended.
            href = goods.xpath(".//a[@rel='noopener']/@href").extract_first()
            if href is None:
                # No link in this block: skip it instead of failing with
                # TypeError on "http:" + None.
                continue
            next_url = "http:" + href
            yield scrapy.Request(url=next_url, callback=self.parse_next)

    def parse_next(self, response):
        """Print the detail-page response (placeholder for real parsing)."""
        print(response)

settings.py 新增了一个下载中间件的开启

# Downloader middlewares. While this setting is left disabled, only the
# framework's built-in downloader components are active.
DOWNLOADER_MIDDLEWARES = {
   'VipSpdier.middlewares.VipspdierDownloaderMiddleware': 543,
   'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware':None # Built-in middleware shipped with the framework (when active it applies the default user agent); mapping it to None disables it
}

middlewares.py

在VipspdierDownloaderMiddleware类下的process_request方法中添加一个浏览器进行js代码的解析(注:JS代码必须由浏览器内核进行解析,所以需要用到selenium添加一个浏览器,而不用系统本身的代码)

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from selenium import webdriver
from time import sleep
from scrapy.http import HtmlResponse


class VipspdierSpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook hands its input on unchanged; the only side effect is a log
    line when the spider is opened.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider.

        Returns None, which lets processing continue.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Forward everything the spider produced for ``response``."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Leave exceptions raised by the spider unhandled."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Forward the spider's start requests unchanged."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)


class VipspdierDownloaderMiddleware(object):
    """Downloader middleware that fetches every request with a real browser.

    The stock downloader does not execute JavaScript, so ``process_request``
    loads the URL in a Selenium-driven browser and returns the rendered page
    source as the response.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to ``spider_opened``."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Render ``request.url`` in a browser and return the resulting page.

        Returning a response object here short-circuits the normal download.

        Args:
            request: the request about to be downloaded.
            spider: the spider the request belongs to.

        Returns:
            HtmlResponse: built from the browser's final URL and page source.
        """
        # NOTE(review): webdriver.PhantomJS is not available in recent
        # Selenium releases - confirm the installed version, or switch to a
        # headless Chrome/Firefox driver.
        driver = webdriver.PhantomJS()
        try:
            driver.get(request.url)
            # Give the page's scripts time to finish loading data.
            sleep(3)
            # Read everything that is needed before the browser goes away.
            body = driver.page_source
            current_url = driver.current_url
        finally:
            # One browser process is started per request; without quit()
            # every request would leave a process running.
            driver.quit()
        print("=========================")
        print("目前正在访问:",request.url)
        # Wrap the rendered source in a response object and hand it back.
        return HtmlResponse(current_url, body=body, encoding='utf-8', request=request)

    def process_response(self, request, response, spider):
        """Pass the response through unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Leave download exceptions unhandled (returns None)."""
        pass

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)

运行结果:



post请求类型页面数据的爬取(案例 :百度翻译)


spiders/baidu.py

# -*- coding: utf-8 -*-
import scrapy


class BaiduSpider(scrapy.Spider):
    """Spider that seeds its crawl with a POST request.

    No ``start_urls``/``parse`` pair is declared: ``start_requests`` is
    overridden instead so that the first request can be a form POST.
    """
    name = 'baidu'
    allowed_domains = ['baidu.com']

    def start_requests(self):
        """Yield the single form POST that starts the crawl."""
        # Request body sent as form fields.
        form_fields = {
            "kw": "hello",
        }
        yield scrapy.FormRequest(
            url="http://fanyi.baidu.com/sug",
            formdata=form_fields,
            callback=self.parse_post,
        )

    def parse_post(self, response):
        """Print the body text of the POST response."""
        print(response.text)

增量爬取(分页不知道上限)


案例文件下载

一级页面:

一级页面下底部分页:不确定分页数

二级页面:


创建spiders目录下的爬虫文件的命令: scrapy genspider -t crawl 爬虫名 域名

scrapy genspider -t crawl dushu dushu.com

items.py

import scrapy


class DushuspiderItem(scrapy.Item):
    """Model of one book, filled from the list page and its detail page."""
    # --- fields taken from the first-level (list) page ---
    # Book title
    name = scrapy.Field()
    # Author
    author = scrapy.Field()
    # Cover image
    cover_img = scrapy.Field()
    # --- fields taken from the second-level (detail) page ---
    # Publisher
    cbs = scrapy.Field()
    # Content summary
    content = scrapy.Field()
    # About the author
    author_info = scrapy.Field()
    # List price
    price = scrapy.Field()
    # Table of contents
    mulu = scrapy.Field()

dushu.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# crawlspider 是scrapy提供的基于basic一种更高级模板的爬虫,这种爬虫,可以通过一定的规则为下载器提供大批量的link,下载器可以自动的调用这些连接
from DushuSpider.items import DushuspiderItem

class DushuSpider(CrawlSpider):
    """CrawlSpider that follows the pagination links of one book category.

    The number of listing pages is not known in advance; the rule below
    keeps following every link that looks like a listing page.
    """
    name = 'dushu'
    allowed_domains = ['dushu.com']
    start_urls = ['http://www.dushu.com/book/1163.html']

    # `rules` is a tuple of Rule objects that widen the crawl.
    # Rule(link_extractor, callback, follow): links matched by the extractor
    # are scheduled for download, and each downloaded page is passed to the
    # callback. The callback is given as a string (the method name).
    #
    # LinkExtractor selects links by:
    #   allow           - regular expression(s) the URL must match (most common)
    #   restrict_xpaths - XPath of the page regions to take links from
    #   restrict_css    - CSS selector of the page regions to take links from
    rules = (
        Rule(LinkExtractor(allow=r'/book/1163_\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract the list-page fields and request every book's detail page.

        Args:
            response: a downloaded listing page.

        Yields:
            One ``scrapy.Request`` per book entry that contains a link.
        """
        book_list = response.xpath("//div[@class='bookslist']/ul/li")
        for book in book_list:
            item = DushuspiderItem()
            item["name"] = book.xpath(".//h3/a/text()").extract_first()
            item["author"] = book.xpath(".//p/a/text()").extract_first()
            item["cover_img"] = book.xpath(".//img/@data-original").extract_first()

            href = book.xpath(".//h3/a/@href").extract_first()
            if href is None:
                # No link in this entry: skip it instead of failing with
                # TypeError on str + None.
                continue
            # Jump to the second-level page; the unfinished item rides along
            # in `meta`.
            detail_url = "http://www.dushu.com" + href
            yield scrapy.Request(url=detail_url, callback=self.parse_next, meta={"book": item})

    def parse_next(self, response):
        """Complete the item with the detail-page fields and emit it.

        Args:
            response: the downloaded detail page; ``response.meta`` carries
                the partly filled item under ``"book"``.

        Yields:
            The completed ``DushuspiderItem``.
        """
        item = response.meta["book"]
        item['cbs'] = response.xpath("//div[@class='book-details-left']/table//tr[2]").extract_first()

        # First block -> content summary, second block -> author info.
        # A page with fewer blocks yields '' instead of raising IndexError.
        summaries = response.xpath("//div[@class='text txtsummary']").extract()
        item['content'] = summaries[0] if len(summaries) > 0 else ''
        item['author_info'] = summaries[1] if len(summaries) > 1 else ''
        item['price'] = response.xpath("//p[@class='price']/span/text()").extract_first()

        m = response.xpath("//div[starts-with(@class,'text txtsummary')]/text()")
        if len(m) == 3:
            # Three text nodes: the book has a table of contents.
            item["mulu"] = m.extract()[2]
        else:
            item["mulu"] = ''
        yield item

settings.py

开启管道,robots.txt规则,UA等

pipelines.py 将数据存入数据库(需要先在数据库中创建库,创建对应的表)

import pymysql
class DushuspiderPipeline(object):
    """Item pipeline that inserts every book item into the MySQL table ``books``.

    The database ``dushudb`` and the table must exist before the crawl.
    """

    # The values are sent as query parameters instead of being formatted into
    # the statement: scraped text containing quotes can then neither break
    # the statement nor inject SQL.
    _INSERT_SQL = "INSERT INTO books VALUES(NULL,%s,%s,%s,%s,%s,%s,%s,%s)"

    def open_spider(self, spider):
        """Open the database connection and create a cursor."""
        self.conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='root',
            db='dushudb',
            charset='utf8'
        )
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item and pass it on unchanged.

        Args:
            item: mapping with the keys ``name``, ``author``, ``cover_img``,
                ``cbs``, ``content``, ``author_info``, ``price``, ``mulu``.
            spider: the spider that produced the item.

        Returns:
            The unchanged item, so later pipelines still receive it.
        """
        params = (
            item['name'],
            item['author'],
            item['cover_img'],
            item['cbs'],
            item['content'],
            item['author_info'],
            item['price'],
            item['mulu'],
        )
        self.cur.execute(self._INSERT_SQL, params)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and then the connection."""
        self.cur.close()
        self.conn.close()

运行结果:


上一篇:
下一篇:
0%