A Step-by-Step Introduction to Python Crawlers by Example

Beautiful Soup provides simple, Pythonic functions for navigating, searching, and modifying the parse tree. It is a toolkit that parses a document and hands you the data you want to scrape; because it is so simple, you can write a complete application with very little code.

Beautiful Soup automatically converts input documents to Unicode and output documents to UTF-8. You don't need to think about encodings at all, unless the document doesn't declare one; in that case Beautiful Soup can't detect the encoding automatically, and you only have to state the original encoding yourself.

Used together with excellent parsers such as lxml and html5lib, Beautiful Soup lets you flexibly choose different parsing strategies or trade that flexibility for speed.
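
To make the above concrete, here is a minimal sketch of parsing an HTML fragment with Beautiful Soup (Python 3 syntax; the sample HTML is made up for illustration):

# -*- coding: utf-8 -*-
# Minimal Beautiful Soup sketch: name the parser explicitly and extract links.
from bs4 import BeautifulSoup

html = '<div class="tags"><a href="/a/1.html">tag1</a><a href="/a/2.html">tag2</a></div>'

# Name the parser explicitly ("html.parser", "lxml", or "html5lib") instead of
# relying on whichever one happens to be installed. If a document declares no
# encoding, pass from_encoding='...' to state the original encoding yourself.
soup = BeautifulSoup(html, 'html.parser')

# Navigation and searching: find() / find_all() walk the parse tree.
hrefs = [a.get('href') for a in soup.find('div', class_='tags').find_all('a')]
print(hrefs)                 # ['/a/1.html', '/a/2.html']

# Internally everything is Unicode; encode() serializes the tree as UTF-8 bytes.
print(soup.encode('utf-8'))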

Contents:
Super simple version
Slightly more complex version
More complex version
Selenium version
JavaScript approach

Super simple version

This works for static pages without much in the way of anti-scraping measures. As an example, here is a script that batch-downloads photos from a gallery site. It was written a long time ago, so it probably won't work as-is any more and would need minor changes. It uses only the most basic techniques, although, being a little impatient, I did make it multithreaded.

#-*- coding:utf-8 -*-
'''
Created on 2015-05-25
Fetch images
Depends on BeautifulSoup
ai8py.com
'''
import uuid
import urllib
from bs4 import BeautifulSoup
import threading
import os
dist = r"f:\\photo\\"
if not os.path.exists(dist):
    os.mkdir(dist)
def parse_first():
    """Parse the home page and return the tag-index links."""
    url = "http://www.ai8py.com/"
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    alist = soup.find('div', class_='tags').find_all('a')
    hrefs = [row.get('href') for row in alist]
    return hrefs
def parse_page(soup):
    """获得分页链接"""
    div=soup.find('div',id="wp_page_numbers")
    page_list=[]
    if div:
        alist= div.find_all('a')
        page_list = [row.get('href') for row in alist]
        page_list = ["http://www.ai8py.com/a/"+url for url in page_list]
    return page_list
def page_subpage(soup):
    """获得详细页链接"""
    page_list = []
    div = soup.find_all('div',class_="pic")
    for row in div:
        a = row.find('a')
        href = a.get('href')
        page_list.append(href)
    return page_list
def getimage(all_list):
    """解析一个页面"""
    all_list = list(set(all_list))
    for url in all_list:
        html = urllib.urlopen(url)
        soup = BeautifulSoup(html)
        div = soup.find_all('div', id="picture")[0]
        imgs = div.find_all('img')
        srcs = [row.get('src') for row in imgs]
        for src in srcs:
            name= dist+threading.currentThread().getName()+uuid.uuid4().hex+'.jpg'
            print name
            urllib.urlretrieve(src,name)
def handle_subpage(first_urls):
    """逻辑控制"""
    all_page_list = []
    for url in first_urls:
        html = urllib.urlopen(url)
        soup = BeautifulSoup(html)
        page_list = parse_page(soup)
        all_page_list.extend(page_list)
    print all_page_list
    for url in all_page_list :
        try:
            html = urllib.urlopen(url)
        except Exception, e:
            print e
            continue
        soup = BeautifulSoup(html)
        ditali_list = page_subpage(soup)
        if ditali_list:
            ditali_list = list(set(ditali_list))
            getimage(ditali_list)
    # nothing else to do; the worker thread ends when this function returns
if __name__=='__main__':
    first_list =parse_first()
    print len(first_list)
    perlen = len(first_list)/4+1
    threads=[]
    for i in range(4):
        if i == 3:
            subs = first_list[3*perlen:len(first_list)]
        else:
            subs = first_list[i*perlen:(i+1)*perlen]
        th = threading.Thread(target=handle_subpage,name="thread"+str(i),args=(subs,))
        threads.append(th)
    if threads:
        for thr in threads:
            thr.start()
Slightly more complex version

The complexity here is in tracing where the site's requests come from and go to. That requires some understanding of the HTTP protocol, and reasonable familiarity with the developer tools in browsers such as Chrome or Firefox. Below is a crawler for Jiayuan (a dating site) as an example.
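
Before reading the full spider, the core idea can be shown in just a few lines: copy the request URL, the headers, and the form fields out of the browser's Network panel and replay them with requests. A minimal sketch (Python 3; the URL and form fields below are placeholders, not the site's real interface):

# -*- coding: utf-8 -*-
# Minimal sketch (Python 3): replay a POST request captured in the browser's
# Network panel. The URL and form fields are placeholders for illustration.
import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36',
})

# Copy the form fields from the captured request and adjust as needed.
form = {'key': '', 'p': 1}

resp = session.post('http://example.com/search', data=form)  # placeholder URL
print(resp.status_code)
print(resp.text[:200])  # inspect the raw response before deciding how to parse it

The actual Jiayuan spider below builds on the same idea: a requests.Session carrying the captured User-Agent header, and the stc form field that encodes the search filters.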

#ai8py.com
from pymongo import MongoClient
import requests
import json
import model
import util
import Queue
import copy
import random
import time
class Spider():
    def __init__(self):
        self.headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 Chrome/43.0.2357.130 Safari/537.36',
                        }
        #self.headers['Cookie']=cookies
        self.post_data = {
            'sex': 'f',
            'key': '',
            'stc': '1:11,2:18.28,3:155.170,23:1',
            'sn': 'default',
            'sv': 1,
            'p': 1,
            'f': '',
            'listStyle': 'bigPhoto',
            'pri_uid': 0,
            'jsversion': 'v5'
        }
        self.citys = Queue.Queue()
        citys = util.getcitys()
        for city in citys:
            self.citys.put(city)
        self.save_file = util.FileUtile()
        self.starturl = 'http://search.jiayuan.com/v2/search_v2.php'
        self.session = requests.Session()
        self.session.headers = self.headers
        collient = MongoClient()
        db = collient['jiayuan']
        self.collection = db['ai8py']
    def getresource(self):
        totalpage = 1000
        repeatpages = 0
        for page in xrange(1,totalpage+1):
            if repeatpages>=2:
                return False
            self.post_data['p']=page
            print self.post_data
            response = self.session.post(self.starturl,data=self.post_data)
            result =  response.content
            result = result.replace('##jiayser##','').replace('//','').strip()
            json_result = json.loads(result)
            if totalpage==1000:
                totalpage = json_result.get('pageTotal')
            print json.dumps(json_result,ensure_ascii=False)
            usersinfo = json_result.get('userInfo')
            indbs = 0
            for row in usersinfo:
                temp_ai8py = model.ai8py()
                temp_ai8py._id=row.get('realUid')
                print temp_ai8py._id
                temp_ai8py.infodict = row
                detailurl = 'http://www.jiayuan.com/%d?fxly=search_v2'%(temp_ai8py._id)
                print detailurl
                print self.collection.find({'_id':temp_ai8py._id}).count()
                if self.collection.find({'_id':temp_ai8py._id}).count()>0:
                    indbs+=1
                    if indbs>=20:
                        repeatpages+=1
                    continue
                indbs = 0
                repeatpages =0
                self.collection.save(temp_ai8py.todict())
                detilhtml = requests.get(detailurl)
                self.save_file.save(detilhtml.text,str(temp_ai8py._id))
    def handle(self):
        print self.citys.qsize()
        while not self.citys.empty():
            for startage in range(18,29):
                endage = startage+2
                city = self.citys.get()
                self.post_data['stc'] = '1:%d,2:%d.%d,3:155.170,23:1' % (city,startage,endage)
                print self.post_data
                try:
                   self.getresource()
                except Exception,e:
                    print e
                    continue
if __name__=='__main__':
    spider = Spider()
    spider.handle()
More complex version

Sometimes the target site sets cookies that are hard to obtain any other way, for example after checking things like your browser's window size, so a real browser has to be involved. With Python I have used two tools for this: Ghost.py and Selenium.
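
A common pattern in this situation is to let a real browser run the JavaScript that sets the cookies, then copy those cookies into a requests session for the bulk of the crawling. A minimal Selenium sketch of that hand-off (Python 3; the URLs are placeholders):

# -*- coding: utf-8 -*-
# Minimal sketch (Python 3): let a real browser obtain JavaScript-set cookies,
# then reuse them in a requests session. The URLs are placeholders.
import requests
from selenium import webdriver

driver = webdriver.Firefox()
driver.set_window_size(1366, 768)      # some sites check the window size
driver.get('http://example.com/')      # the browser runs the JS that sets the cookies

session = requests.Session()
for cookie in driver.get_cookies():    # copy the browser's cookies into requests
    session.cookies.set(cookie['name'], cookie['value'])
driver.quit()

resp = session.get('http://example.com/some/page')  # carries the copied cookies
print(resp.status_code)

The two spiders below drive the browser for the whole crawl instead, first with Ghost.py and then with Selenium.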

Ghost.py

#!/usr/bin/env python
#encoding=utf-8
#ai8py.com
from ghost import Ghost
from ghost import TimeoutError
from bs4 import BeautifulSoup
from pymongo import MongoClient
class Spider():
    def __init__(self):
        self.starturl = 'http://search.zhenai.com/search/getfastmdata.jsps'
        self.headers = {
            'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 Chrome/43.0.2357.130 Safari/537.36',
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
        self.gh = Ghost(wait_timeout=30,download_images=False)
        self._openurl()
        client = MongoClient()
        db = client['jiayuan']
        self.collect = db['zhenids']
        self.provinces =['10102000', '10103000', '10101002', '10101201', '10105000', '10104000', '10101000', '10118000', '10131000',
            '10127000', '10107000', '10124000', '10115000', '10112000', '10125000', '10121000', '10120000', '10117000',
            '10114000', '10106000', '10119000', '10113000', '10116000', '10109000', '10111000', '10110000', '10130000',
            '10128000', '10126000', '10108000', '10123000', '10122000', '10129000',]
    def _setcitys(self,province,city):
        """设置城市参数"""
        page,resourses = self.gh.set_field_value('input[id="regworkProvince"]',city)
        page,resourses = self.gh.set_field_value('input[id="regworkCity"]',city)
        page,resourses = self.gh.set_field_value('input[id="areaFormWorkProvince"]',province)
        page,resourses = self.gh.set_field_value('input[id="areaForm_workCity"]',city)
    def _setforms(self,agebegin='18',ageend='28'):
        agebegin = str(agebegin)
        ageend = str(ageend)
        page,resourses = self.gh.fill('#hsmore',{'agebegin':agebegin,'ageend':ageend,'gender':'1'})
        page,resourses = self.gh.set_field_value('#age1_tmp',agebegin)
        page,resourses = self.gh.set_field_value('#age2_tmp',ageend)
    def _openurl(self):
        page,resourses = self.gh.open(self.starturl,method='post',timeout=200,headers=self.headers)
    def _commit_selform(self):
        """提交选择表单"""
        page,resourses = self.gh.call("#hsmore", "submit", expect_loading=True)
        self.gh.wait_for_page_loaded()
    def _gotopage(self,page):
        """翻页"""
        page,resourses = self.gh.evaluate('window.gotopage(%d);'%page,expect_loading=True)
        print len(resourses)
        self.gh.wait_for_page_loaded()
    def _get_userid(self):
        soup = BeautifulSoup(self.gh.content)
        ul = soup.find('ul',class_='search_user_photo clearfix')
        lis = ul.find_all('li')
        las = [li.find('a',target='_blank') for li in lis]
        hrefs = [a.get('href') for a in las]
        print hrefs
        for href in hrefs:
            if self.collect.find({'_id':href}).count()>0:
                continue
            self.collect.save(({'_id':href,'isGet':0}))
    def parse_page(self):
        """解析网页"""
        for prov in self.provinces:
            self._setcitys(prov,'-1')
            self._setforms()
            try:
                self._commit_selform()
            except TimeoutError,e:
                print e
                continue
            pagesoup = BeautifulSoup(self.gh.content)
            citys = pagesoup.find('div',class_='city_box')
            als = citys.find_all('a')
            citys = [a.get('v') for a in als]
            for city in citys:
                self._setcitys(prov,city)
                for agebegin in range(18,29):
                    for ageend in range(18,29):
                        self._setforms(agebegin,ageend)
                        self._commit_selform()
                        self._get_userid()
                        for page in range(2,50):
                            try:
                                self._gotopage(page)
                                self._get_userid()
                            except TimeoutError,e:
                                print e
                            except AttributeError,e:
                                print e
                                break
Selenium version

#!/usr/bin/env python
#encoding=utf-8
#ai8py.com
from selenium import webdriver
from bs4 import BeautifulSoup
from pymongo import MongoClient
import re
class Spider():
    def __init__(self):
        self.webdriver = webdriver.Firefox(timeout=60)
        self.webdriver.get('http://search.zhenai.com/search/getfastmdata.jsps')
        element = self.webdriver.find_element_by_id('sex')
        element.send_keys(1)
        self.provinces =['10102000', '10103000', '10101002', '10101201', '10105000', '10104000', '10101000', '10118000', '10131000',
                 '10127000', '10107000', '10124000', '10115000', '10112000', '10125000', '10121000', '10120000', '10117000',
                 '10114000', '10106000', '10119000', '10113000', '10116000', '10109000', '10111000', '10110000', '10130000',
                 '10128000', '10126000', '10108000', '10123000', '10122000', '10129000',]
        client = MongoClient()
        db = client['jiayuan']
        self.collect = db['zhenids']
    def click_detail_search(self):
        button = self.webdriver.find_element_by_xpath('//a[@class="btn_orange_L"]')
        button.click()
    def click_down_detail(self):
        down_element = self.webdriver.find_element_by_id('searchSlideDown')
        down_element.click()
    def parse_page(self):
        for province in self.provinces:
            self.click_down_detail()
            pro_element = self.webdriver.find_element_by_id('areaFormWorkProvince')
            pro_element.send_keys(province)
            button = self.webdriver.find_element_by_xpath('//a[@class="btn_search_c"]')
            button.click()
            print "province:%s"%province
            pagesoup = BeautifulSoup(self.webdriver.page_source)
            citys = pagesoup.find('div',class_='city_box')
            als = citys.find_all('a')
            citys = [a.get('v') for a in als]
            for city in citys:
                print ("city:%s"%city)
                self.click_down_detail()
                pelement = self.webdriver.find_element_by_id('regworkProvince_tmp')
                pelement.send_keys(province)
                p2element = self.webdriver.find_element_by_id('regworkProvince')
                p2element.send_keys(province)
                c_element = self.webdriver.find_element_by_id('regworkCity')
                c_element.send_keys(city)
                self.click_detail_search()
                for beginage in range(18,29):
                    self.click_down_detail()
                    begin_el = self.webdriver.find_element_by_id('age1')
                    begin_el.send_keys(beginage)
                    self.click_detail_search()
                    for enage in range(18,29):
                        self.click_down_detail()
                        end_el = self.webdriver.find_element_by_id('age2')
                        end_el.send_keys(enage)
                        self.click_detail_search()
                        soup = BeautifulSoup(self.webdriver.page_source)
                        div_a = soup.find('div',id='setpage')
                        if not div_a:
                            continue
                        all_a = div_a.find_all('a')
                        all_num = [int(re.search('\d+',row.get('href')).group()) for row in all_a]
                        total_page = max(all_num)
                        print total_page
                        for page_num in range(1,total_page):
                            try:
                                page_element = self.webdriver.find_element_by_xpath('//a[text()=%d]'%page_num)
                                page_element.click()
                                self._get_userid()
                                if page_num==50:
                                    break
                            except Exception,e:
                                print e
                                break
    def _get_userid(self):
         soup = BeautifulSoup(self.webdriver.page_source)
         ul = soup.find('ul',class_='search_user_photo clearfix')
         lis = ul.find_all('li')
         las = [li.find('a',target='_blank') for li in lis]
         hrefs = [a.get('href') for a in las]
         loops = 0
         print hrefs
         for href in hrefs:
             if self.collect.find({'_id':href}).count()>0:
                 loops+=1
                 if loops==20:
                     raise Exception('allin db')
                 continue
             self.collect.save(({'_id':href,'isGet':0}))
JavaScript approach

Besides Python, JavaScript is sometimes a better fit, and there are quite a few options such as PhantomJS, Node.js, and CasperJS. Personally, though, I feel you lose a lot of flexibility that way; in that respect it can't compete with Python.

var casper = require('casper').create({
    pageSettings: {
        userAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36'
    },
    logLevel: "info",
    verbose: true
});
casper.start('http://search.zhenai.com/search/getfastmdata.jsps', function() {
    this.fill('form#hsmore', {
        'gender': 1
    }, true);
});
casper.then(function() {
    this.evaluateOrDie(function() {
        return /message sent/.test(document.body.innerText);
    }, 'sending message failed');
});
casper.run(function() {
    this.echo('message sent').exit();
});