http://www.nendo.jp/en/release/2020/
# Retry the request up to four times
def get_req(self, url, headers, timeout=8, num_retries=4):
    print(f">> Crawling {url}")
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        time.sleep(2)  # be polite: pause between successful requests
    except requests.RequestException:
        if num_retries > 0:  # num_retries is the retry budget we allow
            time.sleep(6)  # wait six seconds before retrying
            print(f'>> Request failed, retrying in 6s ({num_retries} attempts left)')
            return self.get_req(url, headers, timeout, num_retries - 1)
        else:
            print(f">> Failed to fetch {url}!")
            with open("fail_url.txt", 'a+', encoding='utf-8') as f:
                f.write(f'{url}\n')
            print('Saved the failed URL to fail_url.txt')
            response = []
    return response
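get_req is written as a method and is later called with self.headers, so the post implies a small crawler class around it. A minimal sketch of the imports and class scaffold these snippets assume (the class name and the User-Agent value are assumptions, not taken from the original post):

import os
import re
import time
import threading

import requests
from lxml import etree


class NendoSpider:  # hypothetical class name; the original post does not show the class definition
    def __init__(self):
        # a browser-like User-Agent so the site does not reject plain script requests (value is an assumption)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

The methods below (dowm_imgs, parse) are listed as members of that same class.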
# Download images with multiple threads
def dowm_imgs(self, img_urls, path):
    threadings = []
    for img_url in img_urls:
        t = threading.Thread(target=self.get_img, args=(img_url, path))
        threadings.append(t)
        t.start()
    for x in threadings:
        x.join()  # wait until every download thread has finished
    print("Multi-threaded image download finished")
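dowm_imgs hands each URL to self.get_img, which this excerpt does not include. A minimal sketch of such a single-image downloader, reusing get_req and assuming the file name is simply taken from the tail of the image URL:

# Download a single image (sketch; the original get_img is not shown in this excerpt)
def get_img(self, img_url, path):
    r = self.get_req(img_url, self.headers, timeout=8, num_retries=4)
    if not r:  # get_req returns [] once every retry has failed; an error Response is also falsy
        return
    filename = img_url.split('/')[-1] or 'unnamed.jpg'  # file name from the URL tail (assumption)
    with open(os.path.join(path, filename), 'wb') as f:
        f.write(r.content)  # write the raw image bytes
    print(f">> Saved {filename} to {path}")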
# Parse a detail page
def parse(self, url, year):
    r = self.get_req(url, self.headers, timeout=8, num_retries=4)
    if not r:  # get_req returns an empty list once every retry has failed
        return
    html = r.content.decode('utf-8')
    tree = etree.HTML(html)
    if len(tree.xpath('//div[@class="entry-content"]/p')) > 0:
        print("This page is a detail page")
        h1 = tree.xpath('//h1[@class="entry-title"]/text()')[0]
        pattern = r"[\/\\\:\*\?\"\<\>\|]"
        title = re.sub(pattern, "_", h1)  # replace characters that are illegal in directory names with underscores
        print(title)
        path = f'{year}/{title}/'
        os.makedirs(path, exist_ok=True)
        imgs = tree.xpath('//div[@class="entry-content"]//img/@src')
        print(imgs)
        self.dowm_imgs(imgs, path)
    else:
        print("This page is not a detail page")
        self.get_parseurls(url, year)
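When a page turns out not to be a detail page, parse falls back to self.get_parseurls, which is also not shown above; presumably it walks the listing page and feeds each candidate detail link back into parse. A rough sketch under that assumption (the XPath and the year filter are guesses about the listing markup, not taken from the original):

# Collect detail-page links from a listing page (sketch; the original get_parseurls is not shown)
def get_parseurls(self, url, year):
    r = self.get_req(url, self.headers, timeout=8, num_retries=4)
    if not r:
        return
    tree = etree.HTML(r.content.decode('utf-8'))
    links = tree.xpath('//a/@href')  # assumed markup: every candidate link is a plain <a href>
    for link in links:
        # only follow links that look like release pages for the requested year (assumption);
        # a real crawler would also remember visited URLs to avoid re-parsing the same page
        if link != url and f'/{year}/' in link:
            self.parse(link, year)

With those pieces in place, a run against the 2020 release index linked at the top of the post would look roughly like:

if __name__ == '__main__':
    spider = NendoSpider()  # hypothetical class name from the scaffold sketch above
    spider.parse('http://www.nendo.jp/en/release/2020/', '2020')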