想要寻找图片的小伙伴们应该不会错过这个网站——对,没错,就是花瓣网,各种图片应有尽有。花瓣网推荐画板里面的词还是非常不错的,可惜被和谐了不少。想要采集花瓣画板的词,用 Python 爬虫当然是没问题的,而且花瓣的数据比较有意思!
查看网页源码,页面里内嵌的数据有点类似数据接口返回的内容:
app.page["explores"] = [{"keyword_id":1541, "name":"创意灯具", "urlname":"chuangyidengju", "cover":{"farm":"farm1", "bucket":"hbimg", "key":"f77b1c1df184ce91ff529a4d0b5211aa883872c91345f-tdQn2g", "type":"image/jpeg", "width":468, "height":702, "frames":1, "file_id":15723730}, "
想了下还是用正则获取比较简单方便!
正则
# Capture the text between `app.page["explores"] = [` and the `];` that is
# later followed by `app.page["followers"]`. re.S lets "." match newlines, so
# the match may span several lines of the page source. [0] takes the first
# match and raises IndexError when the pattern is not found in `html`.
explores=re.findall(r'app.page\["explores"\] = \[(.+?)\];.+?app.page\["followers"\]',html,re.S)[0]
这里需要注意转义符:方括号 [ ] 在正则中是特殊字符,匹配字面量时必须写成 \[ 和 \]。
源码:
# -*- coding: UTF-8 -*-
# Huaban recommended-board keyword scraper
# 2020-03-14, by WeChat: huguo00289
import re
import time

import requests
from fake_useragent import UserAgent

from csql import Save

# Every (name, urlname) pair discovered so far. It also acts as the "visited"
# record that keeps the crawl from following the same keyword twice.
key_informations = []

# The explore page embeds its data as JavaScript assignments; this captures the
# body of the app.page["explores"] array that precedes app.page["followers"].
_EXPLORES_RE = re.compile(
    r'app.page\["explores"\] = \[(.+?)\];.+?app.page\["followers"\]', re.S
)
# Pulls the (name, urlname) pair out of each entry of the captured array.
_KEYWORD_RE = re.compile(r', "name":"(.+?)", "urlname":"(.+?)",', re.S)

# Seconds to wait for a response before giving up on a page.
_REQUEST_TIMEOUT = 10


def _extract_keywords(html):
    """Return the list of (name, urlname) tuples embedded in *html*.

    Returns an empty list when the page carries no "explores" block, instead
    of failing with IndexError on a missing match.
    """
    explores = _EXPLORES_RE.search(html)
    if explores is None:
        return []
    return _KEYWORD_RE.findall(explores.group(1))


def search(key, keyurl):
    """Crawl the explore page of one keyword and recurse into new keywords.

    The page https://huaban.com/explore/<keyurl>/ is downloaded with a random
    User-Agent. If it contains the app.page["category"] marker, the related
    keywords found on it are stored through ``Save(...).sav()`` and every
    keyword not yet present in ``key_informations`` is appended to that list
    and crawled in turn.

    Args:
        key: Display name of the keyword; only used in progress messages.
        keyurl: URL slug of the keyword, inserted into the explore URL.

    Returns:
        None. Results accumulate in the module-level ``key_informations``.

    Note:
        The crawl is recursive, so its depth grows with the length of the
        chain of newly discovered keywords and is bounded by Python's
        recursion limit.
    """
    print(f"正在查询: {key}")
    headers = {"User-Agent": UserAgent().random}
    url = f"https://huaban.com/explore/{keyurl}/"
    try:
        # Without a timeout a stalled connection would hang the crawl forever.
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable page must not abort the whole crawl.
        print(f"请求 {url} 失败: {exc}")
        return
    html = response.content.decode("utf-8", errors="replace")
    time.sleep(2)  # throttle: be polite to the site between requests

    if 'app.page["category"]' in html:
        keyfins = _extract_keywords(html)
        print(keyfins)
        if keyfins:
            # Save opens a database connection per call; skip it when there
            # is nothing to store.
            Save(keyfins).sav()
        for keyfin in keyfins:
            if keyfin not in key_informations:
                key_informations.append(keyfin)
                search(keyfin[0], keyfin[1])
                print(len(key_informations))
    else:
        print(f"查询关键词{key}不是工业设计分类,放弃查询!")

    print(len(key_informations))
    print(key_informations)


if __name__ == "__main__":
    search('3D打印', '3dp')
函数递归调用自身,不断访问新的关键词页面获取数据!
花瓣网画板词采集
数据是下拉加载的,也就是通过 Ajax 异步加载。
同时有一个规律:下一次下拉请求的 max 参数,就是上一批数据中最后一个画板的 seq 值!
源码:
# -*- coding: UTF-8 -*-
# Huaban board-title scraper
# 2020-03-20, by WeChat: huguo00289
import json
import time

import requests

from csql import Save


def get_board(id):
    """Page through the industrial-design board listing and store each title.

    Starting at *id*, the listing endpoint is requested 20 boards at a time.
    Every board title is printed and written to the database through
    ``Save(title).sav2()``. The ``seq`` of the last board on a page becomes
    the ``max`` parameter of the next request. Paging stops when a page has
    no boards, when the response is not a JSON object, or when the request
    fails.

    Args:
        id: ``max`` value for the first request (a board ``seq``).

    Returns:
        None.
    """
    headers = {
        # NOTE(review): this is a logged-in session cookie committed to the
        # source. It will expire and it identifies an account; consider
        # loading it from configuration instead.
        'Cookie': 'UM_distinctid=170c29e8d8f84f-0b44fc835bc8e3-43450521-1fa400-170c29e8d903de; CNZZDATA1256914954=1367860536-1583810242-null%7C1583837292; _uab_collina=158415646085953266966037; __auc=30586f3f170d7154a5593583b24; __gads=ID=28115786a916a7a1:T=1584156505:S=ALNI_MbtohAUwMbbd5Yoa5OBBaSO0tSJkw; _hmt=1; sid=s%3AkwSz9iaMxZf-XtcJX9rrY4ltNDbqkeYs.bc8fvfAq6DLGxsRQ6LF9%2FmHcjOGIhRSZC0RkuKyHd7w; referer=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Df1FbGruB8SzQQxEDyaJ_mefz-bVnJFZJaAcQYJGXTZq%26wd%3D%26eqid%3Dda22ff4e0005f208000000065e74adf2; uid=29417717; _f=iVBORw0KGgoAAAANSUhEUgAAADIAAAAUCAYAAADPym6aAAABJ0lEQVRYR%2B1VuxHCMAyVFqKjomEjVgkb0VDRMQgrmJMdBcUn2VbAXDiSJpb9%2FHl6%2BiCEEAAAAiL9AJP5sgHSQuMXAOIB6NxXO354DOlhxodMhB8vicQxjgxrN4l1IrMRMRzmVkSeQ4pMIUdRp4RNaU4LsRzPNt9rKekmooWWDJVvjqVTuxKJeTWqJL1vkV2CZzJdifRWZ5EitfJrxbI2r6nEj8rxs5w08pAwLkXUgrGg%2FDoqdTN0IzK5ylAkXG6pgx%2F3sfPntuZqxsh9JUkk%2Fry7FtWbdXZvaNFFkgiPLRJyXe5txZfIbEQ4nMjLNe9K7FS9hJqrUeTnibQm%2BeoV0R5olZZctZqKGr5bsnuISPXy8muRssrv6X6AnNRbVau5LX8A%2BDed%2FQkRsJAorSTxBAAAAABJRU5ErkJggg%3D%3D%2CWin32.1920.1080.24; Hm_lvt_d4a0e7c3cd16eb58a65472f40e7ee543=1584330161,1584348316,1584516528,1584705015; __asc=c7dc256a170f7c78b1b2b6abc60; CNZZDATA1256903590=1599552095-1584151635-https%253A%252F%252Fwww.baidu.com%252F%7C1584704759; _cnzz_CV1256903590=is-logon%7Clogged-in%7C1584705067566%26urlname%7Cxpmvxxfddh%7C1584705067566; Hm_lpvt_d4a0e7c3cd16eb58a65472f40e7ee543=1584705067',
        'Referer': 'https://huaban.com/discovery/industrial_design/boards/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'X-Request': 'JSON',
        'X-Requested-With': 'XMLHttpRequest',
    }

    # Iterate instead of recursing: one stack frame per page would end in
    # RecursionError on a long listing.
    max_id = id
    while True:
        url = f"https://huaban.com/discovery/industrial_design/boards/?k804hb1m&max={max_id}&limit=20&wfl=1"
        try:
            response = requests.get(url, headers=headers, timeout=8)
        except requests.RequestException as exc:
            print(f"请求 {url} 失败: {exc}")
            return
        html = response.content.decode('utf-8', errors='replace')
        time.sleep(1)  # throttle between pages
        if not html:
            return

        try:
            req = json.loads(html)
        except ValueError as exc:
            # json.JSONDecodeError subclasses ValueError; e.g. an HTML login
            # page was returned instead of JSON.
            print(f"解析 {url} 的响应失败: {exc}")
            return
        print(req)

        boards = req.get('boards') if isinstance(req, dict) else None
        if not boards:
            # An empty page marks the end of the listing.
            print(0)
            return
        print(len(boards))

        for board in boards:
            print(board['title'])
            Save(board['title']).sav2()

        # The next page starts after the last board of this one.
        max_id = boards[-1]['seq']


if __name__ == '__main__':
    start_id = "1584416341304281760"
    # get_board pages until the listing is exhausted, so a single call is
    # enough; wrapping it in `while True` would re-crawl from the same id.
    get_board(start_id)


# ---------------------------------------------------------------------------
# The crawler above pages through the listing in a while loop; every title is
# finally saved to the database. Source of the `csql` helper module:
# ---------------------------------------------------------------------------
import pymysql


class Save(object):
    """Write scraped keywords to MySQL over a connection opened per instance.

    ``sav()`` stores a sequence of items into table ``huaban`` (the first
    element of each item is inserted); ``sav2()`` stores a single value into
    table ``huaban2``. Both close the cursor and the connection afterwards,
    so an instance can be used for exactly one of them.
    """

    def __init__(self, key):
        """Open the database connection and remember the data to store.

        Args:
            key: For ``sav()``, an iterable of indexable items whose first
                element is the keyword; for ``sav2()``, a single keyword.
        """
        # NOTE(review): connection settings, including the password, are
        # hard-coded; consider moving them to configuration.
        self.host = "localhost"
        self.user = "root"
        self.password = "123456"
        self.db = "xiaoshuo"
        self.port = 3306
        self.connect = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.db,  # `db=` is a deprecated alias of `database=`
            port=self.port,
        )
        self.cursor = self.connect.cursor()
        self.key = key

    def insert(self):
        """Insert the first element of every item in ``self.key`` into ``huaban``.

        Each row is committed on its own; a failing row is reported and the
        remaining rows are still attempted.
        """
        sql = "INSERT INTO huaban(keyword)VALUES(%s)"
        for keyword in self.key:
            try:
                # Parameters must be a sequence: `(x)` is just `x`, `(x,)` is
                # the 1-tuple the driver expects.
                self.cursor.execute(sql, (keyword[0],))
                self.connect.commit()
                print(f'>>> 插入 {keyword[0]} 数据成功!')
            except Exception as e:  # best effort: keep going on a bad row
                print(e)
                print(f'>>> 插入 {keyword[0]} 数据失败!')

    def insert2(self):
        """Insert ``self.key`` as a single row into ``huaban2``."""
        keyword = self.key
        sql = "INSERT INTO huaban2(keyword)VALUES(%s)"
        try:
            self.cursor.execute(sql, (keyword,))
            self.connect.commit()
            print(f'>>> 插入 {keyword} 数据成功!')
        except Exception as e:  # best effort: report and carry on
            print(e)
            print(f'>>> 插入 {keyword} 数据失败!')

    def cs(self):
        """Close the cursor and the database connection."""
        self.cursor.close()
        self.connect.close()

    def sav(self):
        """Run ``insert()`` and always close the connection afterwards."""
        try:
            self.insert()
        finally:
            self.cs()

    def sav2(self):
        """Run ``insert2()`` and always close the connection afterwards."""
        try:
            self.insert2()
        finally:
            self.cs()