最新消息:

Python问答爬虫,360问答爬虫采集抓取源码

Python爬虫 追逐 197浏览 0评论

Python问答爬虫,360问答爬虫采集抓取源码

 

#采集360问答

import requests
import re
import os

def wd(url):
    """Scrape one 360 Wenda question page and append it to ./img/360wd.txt.

    The page is fetched with a desktop browser User-Agent, then the question
    title, the optional question description and the first answer are pulled
    out with regular expressions.

    Args:
        url: Absolute URL of a question page.

    Returns:
        The record that was written: title, optional description and answer,
        each followed by "\\r\\n", then a "--------------" separator line.
        A title or answer that cannot be found is written as an empty string
        instead of raising IndexError.

    Raises:
        requests.RequestException: if the page cannot be fetched in time.
        OSError: if the output file cannot be written.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    # A timeout keeps one stalled connection from hanging the whole crawl.
    html = requests.get(url, headers=headers, timeout=10).text

    # Question title (the site suffix is stripped by the pattern).
    wtze = r'<title>(.+?)_360问答</title>'
    wt = re.findall(wtze, html, re.S)
    print(wt)

    # Optional question description.
    wtmsze = r'<div class="bd"><div class="q-cnt">(.+?)</div>'
    wtms = re.findall(wtmsze, html, re.S)

    # Answer: try the plain container first, then the "src-import" variant.
    daze1 = r'<div class="resolved-cnt">(.+?)</div>'
    daze2 = r'<div class="resolved-cnt src-import">(.+?)</div>'
    da = re.findall(daze1, html, re.S)
    if not da:
        da = re.findall(daze2, html, re.S)
    print(da)

    # Missing pieces become empty strings so an unanswered or oddly
    # formatted page does not abort the caller's loop.
    parts = [wt[0] if wt else ""]
    if wtms:
        parts.append(wtms[0])
    parts.append(da[0] if da else "")
    parts.append("--------------")
    wdsj = "\r\n".join(parts) + "\r\n"

    # Create only the parent directory; creating "./img/360wd.txt" itself as
    # a directory made the following open() fail.
    os.makedirs("./img", exist_ok=True)
    # Append so that records from successive calls accumulate.  utf-8 keeps
    # the Chinese text writable on any platform, and newline='' stops
    # Windows from turning the explicit "\r\n" into "\r\r\n".
    with open("./img/360wd.txt", 'a', encoding='utf-8', newline='') as f:
        f.write(wdsj)
    return wdsj

# Driver script: run one 360 Wenda search and scrape every question that the
# result page links to.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}
ur = "http://wenda.so.com/search/?q=产品设计&pn=1"
# A timeout keeps a stalled connection from hanging the script forever.
htm = requests.get(ur, headers=headers, timeout=10).text

# Each search hit is an <li> whose first link holds the question's href.
ljze = r'<li class="item js-normal-item"><div class="qa-i-hd"><h3><a index="" aId=".+?" target="_blank" href="(.+?)" >'
lj = re.findall(ljze, htm, re.S)
for ljurl in lj:
    # The captured href is prefixed with the site origin, as it was before.
    ljurl = "https://wenda.so.com" + ljurl
    print(ljurl)
    try:
        print(wd(ljurl))
    except (requests.RequestException, IndexError) as err:
        # One unreachable or unparsable question must not stop the others.
        print(ljurl + "---采集失败:", err)


 

Python图片爬虫,国外概念车图片素材采集

#采集概念车图片

import requests
import re
import os

def cjtp(url):
    """Download the text and images of one carbodydesign.com article.

    A folder ./che/<sanitised title>/ is created.  The article body HTML is
    saved there as nrxq.txt and every image found in the <noscript> srcset
    entries is saved as 1.<ext>, 2.<ext>, ... in page order.

    Args:
        url: Absolute URL of the article page.

    Returns:
        None.  A page whose title cannot be found is skipped with a message;
        a missing body or a failed image download skips only that item.

    Raises:
        requests.RequestException: if a request fails or times out.
        OSError: if the folder or a file cannot be written.
    """
    from urllib.parse import urljoin, urlsplit

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    # Raw page dumps are no longer printed; only progress lines remain.
    html = requests.get(url, headers=headers, timeout=10).text

    # Title, used as the folder name.
    btze = r'<title>(.+?)   - Car Body Design</title>'
    bt = re.findall(btze, html, re.S)
    if not bt:
        print(url + "---未找到标题,已跳过")
        return
    # Replace characters that are not allowed in file names.
    btmc = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", bt[0])
    print(btmc)

    folder = "./che/" + btmc + "/"
    os.makedirs(folder, exist_ok=True)

    # Article body.
    nrze = r'<div id="article-box" class="text">(.+?)<div class="share-box">'
    nr = re.findall(nrze, html, re.S)
    if nr:
        with open(folder + "nrxq.txt", 'w', encoding='utf-8') as f:
            f.write(nr[0])
    else:
        print(btmc + "---未找到正文")

    # Image markup: join all <noscript> fragments once, then pick the srcset
    # entry that sits between the 355w and 1600w candidates.
    tupnrze = r'<noscript>(.+?)</noscript></a></p>'
    tpnr = "".join(re.findall(tupnrze, html, re.S))
    tpze = r'355w, (.+?) 1600w"'
    tp = re.findall(tpze, tpnr, re.S)
    print(tp)

    for i, tpurl in enumerate(tp, start=1):
        # urljoin gives the same result as plain concatenation for
        # "/path" entries and also copes with absolute URLs.
        tpurl = urljoin("https://www.carbodydesign.com", tpurl.strip())
        print(tpurl)
        # Take the real extension from the URL path; slicing the last four
        # characters dropped the dot for ".jpeg" / ".webp".
        tpm = os.path.splitext(urlsplit(tpurl).path)[1]
        print(tpm)
        r = requests.get(tpurl, headers=headers, timeout=10)
        if r.status_code != 200:
            # Do not store an HTML error page under an image name.
            print(tpurl + "---下载失败:", r.status_code)
            continue
        with open(folder + str(i) + tpm, 'wb') as f:
            f.write(r.content)
        print(str(i) + tpm + "---已保存!")


# Driver script: walk the concept-car listing pages 1..109 and download every
# article that each page links to.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}
# Listing-card pattern, built once outside the loop.  The whitespace is part
# of the pattern: a newline plus three spaces before <a>, four spaces after
# </a>.
ljze = (
    r'<div class="cbd-column item-post-grid">' "\n"
    r'   <a href="(.+?)"><img src=".+?" width="236" height="177" alt=".+?" class="tn" /></a>    '
)
for n in range(1, 110):
    ur = f'https://www.carbodydesign.com/news/concept-cars/page/{n}/'
    print(ur)
    try:
        # A timeout keeps one stalled page from hanging the crawl.
        htm = requests.get(ur, headers=headers, timeout=10).text
    except requests.RequestException as err:
        print(ur + "---采集失败:", err)
        continue
    lj = re.findall(ljze, htm, re.S)
    print(lj)
    for x in lj:
        print(x)
        try:
            cjtp(x)
        except (requests.RequestException, IndexError) as err:
            # One bad article must not stop the remaining downloads.
            print(x + "---采集失败:", err)

转载请注明:二爷记 » Python问答爬虫,360问答爬虫采集抓取源码

发表我的评论
取消评论
表情

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址