最新消息:

爬虫脚本,Python简单图片爬虫案例

Python爬虫 追逐 114浏览 0评论

现在很多网站都是异步加载的方式加载数据,大部分都是json数据,如果不知道数据的传递过程,一些参数理不清头绪的话,又想要获取数据,那就比较难搞了,尤其是对于本渣渣级选手而言。

网站

目标网址

https://www.keyshot.com/gallery/

需求

获取图片信息,需高清大图

 

经过简单浏览器抓包调试,可以获取到一些信息!

数据传递

数据信息

 

不想努力了,想了两个笨方法,好在数据量不大!

枚举法获取图片地址,爬取图片

1.枚举获取图片地址

代码示例

# Excerpt from a spider method: `self` is the spider instance that
# provides `random_headers`.
# Gallery files are named keyshot-gallery-NNNN.jpg, so every four-digit id
# from 0000 to 9999 is probed.
for i in range(10000):
    # Zero-pad to four digits (3 -> "0003") to match the file naming scheme.
    i = f'{i:04d}'

    print(i)
    url=f"https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-{i}.jpg"
    try:
        # timeout keeps one stalled connection from hanging the whole scan.
        found = requests.get(url, headers=self.random_headers, timeout=8)
    except requests.RequestException as exc:
        # One unreachable id must not abort the remaining probes.
        print(f"请求失败,跳过 {url}: {exc}")
        continue
    # A Response is truthy only when its status code signals success.
    if found:
        print("存在图片!")

图片链接:https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0003.jpg

可以看到id与图片链接是存在关系的,所以,对于id进行迭代,同时进行了if判断!

2.图片下载

代码示例

    def save_img(self, img_url, img_name, path):
        os.makedirs(f'{path}/', exist_ok=True)
        print("开始下载图片!")
        print(f">>> 开始保存 {img_name} 图片")
        r = requests.get(img_url, headers=self.random_headers,timeout=8)
        with open(f'{path}/{img_name}.jpg''wb'as f:
            f.write(r.content)
        print(f">>> 保存 {img_name} 图片成功")

这里需要注意的是 timeout=8 参数一定要设置,尤其是请求国外网站的时候,不然容易卡死!

完整代码

# -*- coding: UTF-8 -*-
#微信:huguo00289
import requests
import random,os


class Httprequest:
    """Base class that hands out request headers with a random User-Agent."""

    # Pool of browser User-Agent strings that random_headers draws from.
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]

    @property  # decorator that exposes the method as an attribute
    def random_headers(self):
        """Return a headers dict whose User-Agent is picked at random from ``ua_list``."""
        chosen_agent = random.choice(self.ua_list)
        return {'User-Agent': chosen_agent}


class Get_imgs(Httprequest):
    """Brute-force downloader for the KeyShot gallery images.

    Gallery files are named ``keyshot-gallery-NNNN.jpg``, so every four-digit
    id is probed and each one that exists is saved to ``self.path``.
    """

    def __init__(self):
        # Directory (relative to the working directory) that receives the images.
        self.path="key"

    def getimgs(self):
        """Probe ids 0000-9999 and download every image that exists.

        A failed request for one id is reported and skipped so that a
        single network error does not end the scan.
        """
        for i in range(10000):
            # Zero-pad to four digits (3 -> "0003") to match the file names.
            img_id = f'{i:04d}'

            print(img_id)
            url=f"https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-{img_id}.jpg"
            try:
                # A Response is truthy only when its status code signals
                # success. timeout keeps a stalled probe from hanging the scan.
                if requests.get(url, headers=self.random_headers, timeout=8):
                    print("存在图片!")
                    self.save_img(url, img_id, self.path)
            except requests.RequestException as exc:
                print(f"请求失败,跳过 {url}: {exc}")

    # Download a single image
    def save_img(self, img_url, img_name, path):
        """Download one image and write it to ``{path}/{img_name}.jpg``.

        Args:
            img_url: Full URL of the image to fetch.
            img_name: File name (without extension) to save under.
            path: Target directory; created when it does not exist yet.
        """
        os.makedirs(f'{path}/', exist_ok=True)
        print("开始下载图片!")
        print(f">>> 开始保存 {img_name} 图片")
        r = requests.get(img_url, headers=self.random_headers,timeout=8)
        # Binary mode: the response body is raw image bytes.
        with open(f'{path}/{img_name}.jpg', 'wb') as f:
            f.write(r.content)
        print(f">>> 保存 {img_name} 图片成功")



# Script entry point: start the brute-force crawl when run directly.
if __name__ == '__main__':
    Get_imgs().getimgs()

手动获取json数据包,爬取图片

1.正则获取图片地址

代码示例

# Every href value in the captured payload is a candidate link;
# re.S lets "." match across line breaks.
zeimg=r'href="(.+?)"'
imgs=re.findall(zeimg,str(datas),re.S)
# Keep only the links that point at www.keyshot.com.
img_urls=[img for img in imgs if "www.keyshot.com" in img]

2.多线程下载图片,这里使用了线程池技术

代码示例

def main():
    """Collect the image URLs and download them concurrently with a thread pool."""
    img_urls=get_imgs()

    try:
        # No argument: the number of workers defaults to the CPU core count.
        # The with-block releases the pool even when a task raises.
        with ThreadPool() as pool:
            # map() returns only after every URL has been processed.
            pool.map(save_img, img_urls)
        print("采集所有图片完成!")

    except Exception as exc:
        # Report the real cause; a bare except would also swallow Ctrl-C.
        print(f"Error: image download failed: {exc}")

完整代码


#keyshot图片采集
# -*- coding: UTF-8 -*-
#微信:huguo00289
import requests,re,os,random
from multiprocessing.dummy import Pool as ThreadPool

def get_imgs():
    """Extract the image URLs from a hand-captured gallery response.

    Returns:
        list: every ``href`` value in the payload that contains
        ``www.keyshot.com``, in document order.
    """
    # Response payload pasted in by hand.
    datas="""
    
    
    data: "<li id="eg-2-post-id-333312" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333312 eg-newli" data-date="1352027697" data-title="dmitrij-le">↵    <div class="esg-media-cover-wrapper">↵<div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0095-700x1121-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0095-700x1121.jpg" alt="" width="700" height="1121"></div>↵↵            <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">↵↵<a class="eg-invisiblebutton  esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0095.jpg"  data-width="923"  data-height="1478" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>↵↵    <div class="esg-center eg-post-333312 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333312" href="javascript:void(0);" target="_self"></a></div>↵              <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵    <div class="esg-center eg-post-333312 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Dmitrij Leppée</div>↵              <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵              <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵           </div>↵   
</div>↵↵</li>↵<li id="eg-2-post-id-333248" data-skin="keyshot-gallery" class="filterall filter-jewelry eg-keyshot-gallery-wrapper eg-post-id-333248 eg-newli" data-date="1351522438" data-title="tiho-ramov">↵    <div class="esg-media-cover-wrapper">↵<div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0073-700x321-25x25.png" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0073-700x321.png" alt="" width="700" height="321"></div>↵↵            <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">↵↵<a class="eg-invisiblebutton  esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0073.png"  data-width="1000"  data-height="458" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>↵↵    <div class="esg-center eg-post-333248 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333248" href="javascript:void(0);" target="_self"></a></div>↵              <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵    <div class="esg-center eg-post-333248 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Tiho Ramovic</div>↵              <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵              <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵           </div>↵   
</div>↵↵</li>↵<li id="eg-2-post-id-333308" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333308 eg-newli" data-date="1349780210" data-title="vitaly-bul">↵    <div class="esg-media-cover-wrapper">↵<div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2012/10/keyshot-gallery-0113-700x1020-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2012/10/keyshot-gallery-0113-700x1020.jpg" alt="" width="700" height="1020"></div>↵↵            <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">↵↵<a class="eg-invisiblebutton  esgbox" href="https://www.keyshot.com/wp-content/uploads/2012/10/keyshot-gallery-0113.jpg"  data-width="961"  data-height="1400" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>↵↵    <div class="esg-center eg-post-333308 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333308" href="javascript:void(0);" target="_self"></a></div>↵              <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵    <div class="esg-center eg-post-333308 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Vitaly Bulgarov</div>↵              <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵              <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵           </div>↵   
</div>↵↵</li>↵<li id="eg-2-post-id-333310" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333310 eg-newli" data-date="1345460494" data-title="maarten-ve">↵    <div class="esg-media-cover-wrapper">↵<div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0094-700x1017-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0094-700x1017.jpg" alt="" width="700" height="1017"></div>↵↵            <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">↵↵<a class="eg-invisiblebutton  esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0094.jpg"  data-width="1321"  data-height="1920" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>↵↵    <div class="esg-center eg-post-333310 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333310" href="javascript:void(0);" target="_self"></a></div>↵              <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵    <div class="esg-center eg-post-333310 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Maarten Verhoeven</div>↵              <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵              <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵           
</div>↵   </div>↵↵</li>↵<li id="eg-2-post-id-333207" data-skin="keyshot-gallery" class="filterall filter-engineering eg-keyshot-gallery-wrapper eg-post-id-333207 eg-newli" data-date="1334153155" data-title="philippe-v">↵    <div class="esg-media-cover-wrapper">↵<div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0054-700x394-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0054-700x394.jpg" alt="" width="700" height="394"></div>↵↵            <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">↵↵<a class="eg-invisiblebutton  esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0054.jpg"  data-width="1280"  data-height="720" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>↵↵    <div class="esg-center eg-post-333207 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333207" href="javascript:void(0);" target="_self"></a></div>↵              <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵    <div class="esg-center eg-post-333207 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Philippe Vanagt</div>↵              <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵              <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵           
</div>↵   </div>↵↵</li>↵"
    message: ""
    success: true
    
    
"""
    href_pattern=r'href="(.+?)"'
    # re.S lets "." match across the line breaks inside the payload.
    all_links=re.findall(href_pattern,str(datas),re.S)
    # Keep only the links that point at www.keyshot.com.
    img_urls=[link for link in all_links if "www.keyshot.com" in link]

    print(len(img_urls))

    return img_urls


#下载图片
def save_img(img_url):
    """Download one image into the ``key`` directory.

    The file is named after the last path segment of ``img_url``. A network
    or HTTP error is reported and the image is skipped, so one bad URL does
    not break a batch of downloads.

    Args:
        img_url: Absolute URL of the image to fetch.
    """
    path = "key"
    # Pool of browser User-Agent strings; one is picked at random per request.
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]
    os.makedirs(f'{path}/', exist_ok=True)
    img_name=img_url.split('/')[-1]
    print("开始下载图片!")
    print(f">>> 开始保存 {img_name} 图片")
    try:
        r = requests.get(img_url,headers={'User-Agent':random.choice(ua_list)},timeout=8)
        # Without this check an HTTP error page would be saved as an image.
        r.raise_for_status()
    except requests.RequestException as exc:
        print(f">>> 保存 {img_name} 图片失败: {exc}")
        return
    # Binary mode: the response body is raw image bytes.
    with open(f'{path}/{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">>> 保存 {img_name} 图片成功")



def main():
    """Collect the image URLs and download them concurrently with a thread pool."""
    img_urls=get_imgs()

    try:
        # No argument: the number of workers defaults to the CPU core count.
        # The with-block releases the pool even when a task raises.
        with ThreadPool() as pool:
            # map() returns only after every URL has been processed.
            pool.map(save_img, img_urls)
        print("采集所有图片完成!")

    except Exception as exc:
        # Report the real cause; a bare except would also swallow Ctrl-C.
        print(f"Error: image download failed: {exc}")




# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()
    

转载请注明:二爷记 » 爬虫脚本,Python简单图片爬虫案例

发表我的评论
取消评论
表情

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址