pyspider爬虫爬取电影网站代码

pyspider爬虫爬取电影网站代码

2017-11-30 / 0 评论 / 133 阅读 / 正在检测是否收录...
温馨提示:
本文最后更新于2021年10月27日,已超过1151天没有更新,若内容或图片失效,请留言反馈。
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-11-30 15:46:23
# Project: ttwanda_3
from pyspider.libs.base_handler import *
import re
import json
from pyspider.libs.utils import md5string
class Handler(BaseHandler):
    """pyspider handler that crawls film listings on www.ttwanda.com.

    Callback chain: ``on_start`` -> ``index_page`` -> ``film_list_page`` ->
    ``film_detail_page`` -> ``film_video_page``. Values scraped on earlier
    pages (poster, star rating) are forwarded to later callbacks through the
    ``save`` argument of ``self.crawl`` and read back from ``response.save``.
    ``film_video_page`` returns the final result dict for one video page.
    """

    crawl_config = {
    }

    # Film listing URLs: the bare /film page or a paginated /film/page/<n>.
    # Raw string so ``\d`` is a regex class, and dots are escaped so they only
    # match a literal '.'.
    _FILM_LIST_URL = re.compile(
        r"http://www\.ttwanda\.com/film/page/\d+|http://www\.ttwanda\.com/film$"
    )
    # Inline script fragment carrying the player type and the video id.
    _PLAYER_VARS = re.compile(r'var play_type="(\w+)",vid="(\w+)";')

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the site home page."""
        self.crawl('http://www.ttwanda.com', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every film-listing link found on the page.

        Only absolute links matching ``_FILM_LIST_URL`` are followed; each is
        handed to ``film_list_page`` with an empty ``save`` dict.
        """
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            if href and self._FILM_LIST_URL.match(href):
                # A new dict per task: listing pages must not share state.
                self.crawl(href, callback=self.film_list_page, save={})

    def film_list_page(self, response):
        """Queue the detail page of every film shown on a listing page.

        For each ``article.u-movie`` element the poster URL and the rating
        text are stored under ``poster`` and ``star`` and forwarded to
        ``film_detail_page``. The "next page" link, when present, is sent
        back through ``index_page``.
        """
        inherited = response.save or {}
        for each in response.doc('article.u-movie').items():
            detail_url = each('.list-poster a[href^="http"]').attr.href
            if not detail_url:
                # No absolute detail link for this entry; nothing to crawl.
                continue
            # Copy per film. crawl() receives the dict object itself, so
            # mutating a single shared dict across iterations risks every
            # queued task carrying the last film's poster/star.
            # NOTE(review): exact impact depends on when pyspider serializes
            # ``save``; copying is correct either way.
            film = dict(inherited)
            film['poster'] = each('img').attr['data-original']
            film['star'] = each('.pingfen').text()
            self.crawl(detail_url, callback=self.film_detail_page,
                       save=film, priority=1)

        next_url = response.doc('.next-page a').attr('href')
        if next_url:
            # The last listing page has no next link; skip instead of
            # passing None to crawl().
            self.crawl(next_url, callback=self.index_page)

    def film_detail_page(self, response):
        """Queue every play link (``.mplay-list a``) of a film detail page.

        The metadata received in ``response.save`` is forwarded unchanged to
        ``film_video_page``, one independent copy per queued task.
        """
        inherited = response.save or {}
        for each in response.doc('.mplay-list a').items():
            href = each.attr.href
            if href:
                self.crawl(href, callback=self.film_video_page,
                           save=dict(inherited))

    def film_video_page(self, response):
        """Build and return the result dict for one video page.

        Starts from the forwarded metadata in ``response.save`` and adds:
        ``title`` (text of ``.player_box>strong``), ``url`` (the page URL),
        and, when a script on the page matches ``_PLAYER_VARS``, ``vtype``
        and ``vid``. If several scripts match, the last match wins.
        """
        result = dict(response.save or {})
        result['title'] = response.doc('.player_box>strong').text()
        result['url'] = response.url
        for each in response.doc('script').items():
            found = self._PLAYER_VARS.search(each.text())
            if found:
                result['vtype'], result['vid'] = found.groups()
        return result

0

评论 (0)

取消