最新消息:

Python爬虫,python台湾金点设计奖数据采集源码

Python爬虫 追逐 182浏览 0评论

渣渣写的爬虫,也是以前就写过的一个网站,采集官方数据,应用python进行数据采集抓取,同时进行了try...except报错处理,算是可以运行完毕的爬虫,同时将报错相关记录写入到了txt文件中,可以进行后续报错中断数据的补采集操作!

目标网址: https://www.goldenpin.org.tw/%E9%87%91%E9%BB%9E%E8%A8%AD%E8%A8%88%E7%8D%8E/?y=2020

金点设计奖

 

想要获取一个网站的数据,在排除反爬限制的前提下,就是请求访问的中断处理和完善,偶尔服务器访问请求中断或者报错,以及节点获取的错误,都可以考虑进去,防止爬虫中断,当然采集数据建议还是连接数据库保存相关数据信息内容,不管是后面补采集还是其他都更加方便处理操作!

几个关键点
  • requests访问超时封装处理
from requests.adapters import HTTPAdapter


self.s = requests.Session()
self.s.mount('http://', HTTPAdapter(max_retries=5))
self.s.mount('https://', HTTPAdapter(max_retries=5))

更改数字,可以设置重试次数!

  • 格式化数据
for href,category in zip (hrefs,categorys):
    print(href,category)

for ... in zip() 的用法可自行搜索参考相关文档和示例!

  • 标题格式化数据处理
title=req.xpath('//h1[@class="entry-title"]/text()')[0]
pattern = r"[\/\\\:\*\?\"\<\>\|]"
h1 = re.sub(pattern, "_", title)  # 替换为下划线

标题作为数据存储的文件名,需要进行格式化处理,将非法字符替换为下划线!

完整源码参考:
#金点奖设计采集
#20201118 @author:WX:huguo00289


# -*- coding: UTF-8 -*-
import requests,re,time
from fake_useragent import UserAgent
import json,os
from lxml import etree
from requests.adapters import HTTPAdapter




class Gd(object):
    """Scraper for Golden Pin Design Award entries (goldenpin.org.tw).

    Pulls paginated portfolio listings from the site's admin-ajax endpoint,
    visits each entry page, appends the entry's text to a per-category .txt
    file, and downloads the entry's images into ``category/title/``.

    Every stage is wrapped in best-effort error handling: failures are
    printed and appended to ``*_fail.txt`` files so the failed items can be
    re-crawled later by a companion script.
    """

    def __init__(self):
        # Pick one random User-Agent for the lifetime of this instance.
        self.ua = UserAgent()
        self.headers = {
            'User-Agent': self.ua.random,
        }
        self.url = "https://www.goldenpin.org.tw/ft-admin/admin-ajax.php"
        # Session with connection-level retries so transient network
        # errors do not abort the crawl (see HTTPAdapter docs).
        self.s = requests.Session()
        self.s.mount('http://', HTTPAdapter(max_retries=5))
        self.s.mount('https://', HTTPAdapter(max_retries=5))

    def get_content(self, i):
        """Fetch listing page *i* and crawl every entry link found on it.

        The endpoint answers JSON of the shape ``{"success": bool,
        "html": "..."}``; entry links and categories are scraped from the
        HTML payload with regexes.  Entries that fail to parse are logged
        to ``href_fail.txt`` as ``href,category`` lines.
        """
        data = {
            'action': 'presscore_template_ajax',
            'postID': '15317',
            'paged': 1,
            'targetPage': i,
            'term': '',
            'orderby': '',
            'order': '',
            'nonce': '004811f2a4',
            'contentType': 'portfolio',
            'pageData[type]': 'page',
            'pageData[template]': 'portfolio',
            'pageData[layout]': 'masonry',
            'sender': 'more',
        }
        response = self.s.post(self.url, timeout=8, headers=self.headers, data=data)
        # response.json() replaces the manual decode + json.loads dance.
        req = response.json()
        print(req['success'])
        if req['success']:
            html = req['html']
            hrefs = re.findall(
                r'<a target="_blank"  href="(.+?)" class="alignnone rollover layzr-bg"',
                html, re.S)
            print(len(hrefs))
            categorys = re.findall(
                r'<div class="entry-meta portfolio-categories"><span class="category-link"><a href=".+?" >(.+?)</span></div></div>',
                html, re.S)
            print(len(categorys))
            for href, category in zip(hrefs, categorys):
                print(href, category)
                try:
                    self.parse(href, category)
                except Exception as e:
                    # Best-effort: record the failure and keep crawling.
                    print(f'采集网页:{href} 出错,错误代码:{e}')
                    with open('href_fail.txt', 'a+', encoding='utf-8') as f:
                        f.write(f'{href},{category}\n')

    def parse(self, href, category):
        """Crawl one entry page: save its text and download its images.

        The entry title doubles as a directory name, so filesystem-illegal
        characters are replaced with underscores first.  Image failures are
        logged to ``imgs_fail.txt``.
        """
        print(f'>> 正在采集网页 {href} 数据..')
        html = self.s.get(href, headers=self.headers, timeout=8).content.decode('utf-8')
        req = etree.HTML(html)
        title = req.xpath('//h1[@class="entry-title"]/text()')[0]
        # Strip characters that are illegal in Windows file/dir names.
        pattern = r"[\/\\\:\*\?\"\<\>\|]"
        h1 = re.sub(pattern, "_", title)
        path = f'{category}/{h1}/'
        infos = req.xpath('//div[@class="wpb_wrapper"]//text()')
        info = ''.join(infos)
        with open(f'{category}.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{title}\n{info}\n\n')
        imgs = req.xpath('//div[@class="fancy-media-wrap layzr-bg"]/img[@class="lazy-load preload-me"]/@data-src')
        print(h1, info, imgs)
        try:
            self.downs(imgs, path)
        except Exception as e:
            print(f'获取图片:{imgs} 出错,错误代码:{e}')
            with open('imgs_fail.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{imgs},{path}\n')

    def downs(self, imgs, path):
        """Download every image URL in *imgs* into directory *path*.

        Per-image failures are logged to ``img_fail.txt`` so the download
        can be retried later without re-crawling the whole entry.
        """
        os.makedirs(path, exist_ok=True)
        for img_url in imgs:
            img_name = img_url.split('/')[-1]
            print(img_url, img_name)
            try:
                self.dwon_img(img_url, img_name, path)
            except Exception as e:
                print(f'下载图片:{img_name} 出错,错误代码:{e}')
                with open('img_fail.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'{img_url},{img_name},{path}\n')

    def dwon_img(self, img_url, img_name, path):
        """Download a single image to ``path + img_name``.

        NOTE: the name is a typo of "down_img" but is kept as-is because
        the companion re-crawl script calls it under this name.
        """
        print(f'>> 正在下载图片:{img_name} ..')
        r = self.s.get(img_url, timeout=8, headers=self.headers)
        with open(f'{path}{img_name}', 'wb') as f:
            f.write(r.content)
        print(f'>> 图片:{img_name} 下载完成!')

    def run(self):
        """Crawl listing pages 1..999; failed page numbers go to list_fail.txt."""
        for i in range(1, 1000):
            print(f'>> 正在爬取第 {i} 页数据..')
            try:
                self.get_content(i)
            except Exception as e:
                print(f'爬取第 {i} 页数据出错,错误代码:{e}')
                with open('list_fail.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'{i}\n')






if __name__ == '__main__':
    spider = Gd()
    # Bug fix: the original only instantiated Gd and never started the
    # crawl, so the script exited without doing any work.
    spider.run()
数据补采集操作源码参考:
#金点奖设计采集补采集


# -*- coding: UTF-8 -*-
from gdspider import Gd




#补图片
def get_bimg_fail():
    """Re-download the images recorded in bimg_fail.txt.

    Each line has the form ``img_url,img_name,path`` as written by the
    crawler's failure logging; every line is replayed through
    ``Gd.dwon_img``.
    """
    with open(r'bimg_fail.txt', 'r', encoding='utf-8') as fh:
        lines = fh.readlines()

    print(len(lines))
    spider = Gd()
    for line in lines:
        fields = line.strip().split(',')
        print(fields)
        spider.dwon_img(fields[0], fields[1], fields[2])






#补连接
def get_href_fail():
    """Re-crawl the entry pages recorded in bhref_fail.txt.

    Normal lines are ``href,category``.  Some failure lines instead carry a
    raw ``<a href=`` HTML fragment; for those the category is recovered
    from the last path segment and its trailing markup stripped.
    """
    with open(r'bhref_fail.txt', 'r', encoding='utf-8') as fh:
        failed = fh.readlines()

    print(len(failed))
    spider = Gd()
    for raw in failed:
        parts = raw.strip().split(",")
        href = parts[0]

        if "<a href=" in str(parts):
            # Malformed line: dig the category out of the link fragment.
            category = parts[-1].split('/')[-1].strip().replace('" >', '')
        else:
            category = parts[1]

        # Drop a stray leading space that some lines carry before the URL.
        href = href.replace(' https', 'https')
        print(href, category)
        spider.parse(href, category)






def get_blist():
    """Re-fetch the listing pages whose numbers are recorded in blist_fail.txt."""
    with open(r'blist_fail.txt', 'r', encoding='utf-8') as fh:
        pages = fh.readlines()

    print(len(pages))
    spider = Gd()
    for page in pages:
        page = page.strip()
        print(page)
        spider.get_content(page)








if __name__=='__main__':
    # Replay failed image downloads; swap the comments below to replay
    # failed entry pages instead.
    get_bimg_fail()
    #get_href_fail()

关注本渣渣微信公众号:二爷记

微信公众号:二爷记

后台回复关键字:“金点设计奖”

获取完整项目

 

转载请注明:二爷记 » Python爬虫,python台湾金点设计奖数据采集源码

发表我的评论
取消评论
表情

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址