pyspider爬虫爬取电影网站代码

dhso
2017/11/30 15:52
统计中
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-11-30 15:46:23
# Project: ttwanda_3

from pyspider.libs.base_handler import *
import re
import json
from pyspider.libs.utils import md5string

class Handler(BaseHandler):
    """pyspider handler that crawls www.ttwanda.com film listings.

    Callback chain:
    ``on_start`` -> ``index_page`` -> ``film_list_page`` ->
    ``film_detail_page`` -> ``film_video_page``.

    Per-movie data (poster, star) is carried from one callback to the next
    through the ``save`` argument of ``self.crawl`` and is returned, together
    with title/url/vtype/vid, as the result of ``film_video_page``.
    """

    crawl_config = {
    }

    # Listing pages: the bare ".../film" URL or any ".../film/page/<n>" URL.
    _FILM_LIST_URL = re.compile(r"http://www\.ttwanda\.com/film(?:/page/\d+|$)")

    # Inline player configuration found in a <script> on the video page.
    _PLAYER_VARS = re.compile(r'var play_type="(\w+)",vid="(\w+)";')

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the site's front page."""
        self.crawl('http://www.ttwanda.com', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every absolute link that points at a film listing page."""
        for link in response.doc('a[href^="http"]').items():
            href = link.attr.href
            if href and self._FILM_LIST_URL.match(href):
                self.crawl(href, callback=self.film_list_page, save={})

    def film_list_page(self, response):
        """Queue the detail page of each movie on a listing page.

        Each queued task gets its own ``save`` dict holding the movie's
        ``poster`` and ``star`` values. The pagination link, when present,
        is fed back into ``index_page``.
        """
        inherited = response.save or {}
        for article in response.doc('article.u-movie').items():
            detail_url = article('.list-poster a[href^="http"]').attr.href
            if not detail_url:
                # No usable detail link for this entry; nothing to follow.
                continue
            # Build a fresh dict per movie: passing one shared, mutated dict
            # to every crawl() call would let later movies overwrite the
            # poster/star values attached to earlier tasks.
            movie = dict(inherited)
            movie['poster'] = article('img').attr['data-original']
            movie['star'] = article('.pingfen').text()
            self.crawl(detail_url, callback=self.film_detail_page,
                       save=movie, priority=1)

        next_url = response.doc('.next-page a').attr('href')
        if next_url:
            self.crawl(next_url, callback=self.index_page)

    def film_detail_page(self, response):
        """Queue every play link of a movie, forwarding the saved data unchanged."""
        carried = response.save
        for link in response.doc('.mplay-list a').items():
            href = link.attr.href
            if href:
                self.crawl(href, callback=self.film_video_page, save=carried)

    def film_video_page(self, response):
        """Build the final result dict for one video page.

        Returns the saved per-movie data extended with ``title``, ``url`` and,
        when a matching inline script is found, ``vtype`` and ``vid``.
        """
        result = dict(response.save or {})
        result['title'] = response.doc('.player_box>strong').text()
        result['url'] = response.url
        for script in response.doc('script').items():
            found = self._PLAYER_VARS.search(script.text())
            if found:
                # If several scripts match, the last one wins.
                result['vtype'] = found.group(1)
                result['vid'] = found.group(2)

        return result

本文为 dhso 原创

发布在 http://blog.minws.com/pyspiderpa-chong-pa-qu-dian-ying-wang-zhan-dai-ma/

如有转载,请标明来源!

作者信息
姓名:dhso
热评文章
最新评论
文章概览