Keyword crawler: collecting Huaban board keywords with Python and storing them in a database

If you're ever hunting for images, you've probably come across this site: Huaban (huaban.com), which has pictures of just about everything. The keywords behind its recommended boards are pretty good too, even if quite a few have been taken down. Collecting those board keywords is no problem for a Python crawler, and Huaban's data turns out to be quite interesting!

pic_001.jpg

Inspecting the page source, the data is embedded in the HTML and looks a bit like an API response:

app.page["explores"] = [{"keyword_id":1541, "name":"创意灯具", "urlname":"chuangyidengju", "cover":{"farm":"farm1", "bucket":"hbimg", "key":"f77b1c1df184ce91ff529a4d0b5211aa883872c91345f-tdQn2g", "type":"image/jpeg", "width":468, "height":702, "frames":1, "file_id":15723730}, "

After thinking it over, a regular expression is the simplest and most convenient way to pull this out.

The regex:

explores=re.findall(r'app.page\["explores"\] = \[(.+?)\];.+?app.page\["followers"\]',html,re.S)[0]

Note the escape characters here: the square brackets in app.page["explores"] have to be escaped with backslashes in the pattern.
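To make the capture groups concrete, here is a quick standalone demo of the two regexes used in the script below, run against a fragment shortened from the page source quoted above (only the field names already shown there are assumed):

import re

html = ('app.page["explores"] = [{"keyword_id":1541, "name":"创意灯具", '
        '"urlname":"chuangyidengju", "cover":{"farm":"farm1"}}]; '
        'app.page["followers"] = [];')

# grab everything between app.page["explores"] and app.page["followers"]
explores = re.findall(r'app.page\["explores"\] = \[(.+?)\];.+?app.page\["followers"\]', html, re.S)[0]
# then pull out the (name, urlname) pairs
print(re.findall(r', "name":"(.+?)", "urlname":"(.+?)",', explores, re.S))
# -> [('创意灯具', 'chuangyidengju')]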

Source code:

# -*- coding: UTF-8 -*-
# Huaban recommended-board keyword crawler
# 2020-03-14 by WeChat: huguo00289

from fake_useragent import UserAgent
import requests,re,time
from csql import Save


key_informations=[]  # (name, urlname) pairs already collected, to avoid re-crawling

def search(key,keyurl):
    print(f"正在查询: {key}")
    ua = UserAgent()
    headers = {"User-Agent": ua.random}
    url=f"https://huaban.com/explore/{keyurl}/"
    html=requests.get(url,headers=headers).content.decode("utf-8")
    time.sleep(2)
    if 'app.page["category"]' in html:
        #print(html)
        explores=re.findall(r'app.page\["explores"\] = \[(.+?)\];.+?app.page\["followers"\]',html,re.S)[0]
        #print(explores)
        keyfins=re.findall(r', "name":"(.+?)", "urlname":"(.+?)",',explores,re.S)
        print(keyfins)
        sa=Save(keyfins)
        sa.sav()
        for keyfin in keyfins:
            if keyfin not in key_informations:
                key_informations.append(keyfin)
                search(keyfin[0], keyfin[1])


        print(len(key_informations))
    else:
        print(f"The keyword {key} is not in the industrial design category, skipping!")

print(len(key_informations))
print(key_informations)

search('3D打印', '3dp')

The function keeps calling itself, looping through new keyword pages and collecting data as it goes.
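One caveat: a long enough chain of new keywords could eventually hit Python's recursion limit. A minimal non-recursive sketch of the same idea, assuming search() were refactored to return the (name, urlname) pairs it finds instead of recursing itself, could use an explicit queue:

from collections import deque

def crawl_keywords(seed_name, seed_urlname):
    seen = set()                                 # (name, urlname) pairs already visited
    queue = deque([(seed_name, seed_urlname)])
    while queue:
        name, urlname = queue.popleft()
        if (name, urlname) in seen:
            continue
        seen.add((name, urlname))
        for pair in search(name, urlname):       # assumes search() returns its keyfins list
            if pair not in seen:
                queue.append(pair)
    return seen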

Collecting Huaban board titles

pic_002.jpg

The board list loads as you scroll, i.e. the data comes in via AJAX requests.

There is also a handy pattern: the max parameter of the next request is the seq of the last board in the current response!
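A minimal sketch of just that pagination rule, assuming the response layout used by the full script below (a boards list where every board carries a seq field):

import requests

resp = requests.get(
    "https://huaban.com/discovery/industrial_design/boards/",
    params={"max": "1584416341304281760", "limit": 20, "wfl": 1},
    headers={"X-Requested-With": "XMLHttpRequest", "X-Request": "JSON"},
    timeout=8,
)
boards = resp.json()["boards"]
next_max = boards[-1]["seq"]   # feed this back in as max to fetch the next page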

Source code:

# -*- coding: UTF-8 -*-
# Huaban board title crawler
# 2020-03-20 by WeChat: huguo00289


from csql import Save
import requests,json,time


def get_board(id):
    headers={
        'Cookie': 'UM_distinctid=170c29e8d8f84f-0b44fc835bc8e3-43450521-1fa400-170c29e8d903de; CNZZDATA1256914954=1367860536-1583810242-null%7C1583837292; _uab_collina=158415646085953266966037; __auc=30586f3f170d7154a5593583b24; __gads=ID=28115786a916a7a1:T=1584156505:S=ALNI_MbtohAUwMbbd5Yoa5OBBaSO0tSJkw; _hmt=1; sid=s%3AkwSz9iaMxZf-XtcJX9rrY4ltNDbqkeYs.bc8fvfAq6DLGxsRQ6LF9%2FmHcjOGIhRSZC0RkuKyHd7w; referer=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Df1FbGruB8SzQQxEDyaJ_mefz-bVnJFZJaAcQYJGXTZq%26wd%3D%26eqid%3Dda22ff4e0005f208000000065e74adf2; uid=29417717; _f=iVBORw0KGgoAAAANSUhEUgAAADIAAAAUCAYAAADPym6aAAABJ0lEQVRYR%2B1VuxHCMAyVFqKjomEjVgkb0VDRMQgrmJMdBcUn2VbAXDiSJpb9%2FHl6%2BiCEEAAAAiL9AJP5sgHSQuMXAOIB6NxXO354DOlhxodMhB8vicQxjgxrN4l1IrMRMRzmVkSeQ4pMIUdRp4RNaU4LsRzPNt9rKekmooWWDJVvjqVTuxKJeTWqJL1vkV2CZzJdifRWZ5EitfJrxbI2r6nEj8rxs5w08pAwLkXUgrGg%2FDoqdTN0IzK5ylAkXG6pgx%2F3sfPntuZqxsh9JUkk%2Fry7FtWbdXZvaNFFkgiPLRJyXe5txZfIbEQ4nMjLNe9K7FS9hJqrUeTnibQm%2BeoV0R5olZZctZqKGr5bsnuISPXy8muRssrv6X6AnNRbVau5LX8A%2BDed%2FQkRsJAorSTxBAAAAABJRU5ErkJggg%3D%3D%2CWin32.1920.1080.24; Hm_lvt_d4a0e7c3cd16eb58a65472f40e7ee543=1584330161,1584348316,1584516528,1584705015; __asc=c7dc256a170f7c78b1b2b6abc60; CNZZDATA1256903590=1599552095-1584151635-https%253A%252F%252Fwww.baidu.com%252F%7C1584704759; _cnzz_CV1256903590=is-logon%7Clogged-in%7C1584705067566%26urlname%7Cxpmvxxfddh%7C1584705067566; Hm_lpvt_d4a0e7c3cd16eb58a65472f40e7ee543=1584705067',
        'Referer': 'https://huaban.com/discovery/industrial_design/boards/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'X-Request': 'JSON',
        'X-Requested-With': 'XMLHttpRequest',
    }

    url="https://huaban.com/discovery/industrial_design/boards/?k804hb1m&max=%s&limit=20&wfl=1" % id
    html=requests.get(url,headers=headers,timeout=8).content.decode('utf-8')
    time.sleep(1)
    if html:
        req=json.loads(html)
        print(req)
        boards=req['boards']
        print(len(boards))
        for board in boards:
            print(board['title'])
            sa = Save(board['title'])
            sa.sav2()
            #print(board['seq'])

        next_id=boards[-1]['seq']
        get_board(next_id)


if __name__ == '__main__':
    id="1584416341304281760"
    while True:
        get_board(id)


The crawl is driven by a while loop together with the function calling itself (get_board calls get_board with the next seq).
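As in the first script, those self-calls pile up on the call stack. A hedged alternative sketch, assuming get_board were changed to return boards[-1]['seq'] (or None when nothing comes back) instead of calling itself, would let the while loop drive the whole crawl on its own:

def crawl(start_id):
    next_id = start_id
    while next_id:
        next_id = get_board(next_id)   # assumed to return the next max, or None when finished

if __name__ == '__main__':
    crawl("1584416341304281760")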


Finally, everything gets saved to the database.

Source code:

import pymysql

class Save(object):
    def __init__(self,key):
        self.host="localhost"
        self.user="root"
        self.password="123456"
        self.db="xiaoshuo"
        self.port=3306
        self.connect = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            db=self.db,
            port=self.port,
        )
        self.cursor = self.connect.cursor()  # create a cursor
        self.key=key

    def insert(self):
        # insert each (name, urlname) tuple collected from the explore pages
        for keyword in self.key:
            try:
                sql = "INSERT INTO huaban (keyword) VALUES (%s)"
                val = (keyword[0],)
                self.cursor.execute(sql, val)
                self.connect.commit()
                print(f'>>> Inserted {keyword[0]} successfully!')
            except Exception as e:
                print(e)
                print(f'>>> Failed to insert {keyword[0]}!')


    def insert2(self):
        # insert a single board title string
        keyword = self.key
        try:
            sql = "INSERT INTO huaban2 (keyword) VALUES (%s)"
            val = (keyword,)
            self.cursor.execute(sql, val)
            self.connect.commit()
            print(f'>>> Inserted {keyword} successfully!')
        except Exception as e:
            print(e)
            print(f'>>> Failed to insert {keyword}!')

    def cs(self):
        # close the cursor and the database connection
        self.cursor.close()
        self.connect.close()



    def sav(self):
        self.insert()
        self.cs()


    def sav2(self):
        self.insert2()
        self.cs()
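The article never shows the table schema those INSERT statements expect, so here is a hedged one-off setup sketch; the layout is an assumption (a single keyword column plus an auto-increment id, utf8mb4 so Chinese keywords store cleanly):

import pymysql

conn = pymysql.connect(host="localhost", user="root", password="123456",
                       db="xiaoshuo", port=3306)
with conn.cursor() as cur:
    for table in ("huaban", "huaban2"):   # assumed table names, taken from the INSERTs above
        cur.execute(
            f"CREATE TABLE IF NOT EXISTS {table} ("
            "  id INT AUTO_INCREMENT PRIMARY KEY,"
            "  keyword VARCHAR(255) NOT NULL"
            ") DEFAULT CHARSET=utf8mb4"
        )
conn.commit()
conn.close()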