python爬虫(无聊写的)

python爬虫(无聊写的)

枫
2023-04-20 / 0 评论 / 130 阅读 / 耗时 171ms / 正在检测是否收录...

介绍

  自行测试,代码大部分都是使用 re 正则表达式提取数据。

代码一

import re
import requests
import os

# Site: lca789.com — test before use.
# NOTE(review): the original comment said "ca789.com" but every URL below
# uses lca789.com — confirm which host is intended.

pages = int(input('输入你要爬取的页数'))
types = input("输入你想要爬取的类型(toupai、meitui、oumei、katong)>>>:")
# os.makedirs creates missing parents and tolerates an existing directory,
# unlike the original os.mkdir (which raised when D:/image did not exist).
os.makedirs(f'D:/image/{types}//', exist_ok=True)

num = 1
# range(1, pages + 1) so the last requested page is included
# (the original range(1, pages) silently dropped it).
for page in range(1, pages + 1):
    url = f'https://lca789.com/pic/{types}/index_{page}.html'
    # Picture categories: toupai / meitui / oumei / katong
    print('\n===============正在爬取第', page, '页===============\n\n')
    print('类型:', types)
    data = requests.get(url)
    # Each listing page holds links to per-album detail pages.
    link = re.findall('<dd><a href="(.*?)" target="_blank"><h3>', data.text)
    for href in link:  # renamed: the original reused `i` across nested loops
        html_url = 'https://lca789.com/' + href
        data1 = requests.get(html_url)

        img_url = re.findall("<img src='(.*?)'><br><br>", data1.text)
        title = re.findall("<title>(.*?)</title>", data1.text)[0]
        for src in img_url:
            # Strip stray markup the greedy-ish regex sometimes captures.
            src = re.sub("'><br><img src=|'> <br><img src='", '', src)
            print(title, src)
            data2 = requests.get(src).content
            # Sequential counter as filename; with-block guarantees the
            # handle is closed even if a later request fails.
            with open(f'D://image//{types}//{num}.jpg', mode='wb') as f:
                f.write(data2)
                num += 1

print('\n===============爬取结束===============\n')

代码二

import os
import re
import requests

# Browser-like headers; the cookie was captured from a real session and
# presumably required by the host — TODO confirm it is still valid.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54',
    'cookie': 'yabs-sid=1394784121678716100; is_gdpr=0; is_gdpr_b=CJ6rGBDZqwE=; yandexuid=9617169011678716092; yuidss=9617169011678716092; i=BXtIPJOfVAf3+Y2wHz+oX9kKFt7x2/gt1yiZdaR+c0Q3GvRek5COFcuDb8QD5Sz31xEWBn6wPoEfZqXTHuqsfsGEda4=; yp=1678802891.yu.3878431071678716100; ymex=1681308491.oyu.3878431071678716100#1994076100.yrts.1678716100#1994076100.yrtsi.1678716100',

}

url = 'https://xn---50ppiccom-4s2r687bes0e.www-50ppic.com/?fuli.one'
data = requests.get(url=url, headers=headers).text
# Image URLs embedded in the gallery grid.
data_url = re.findall('<img src="(.*?)" /></a></div><div class="item">', data)

len_num = len(data_url)
print('==============共', len_num, '张图片==============')
# Ensure the target directory exists (the original assumed D:/image did).
os.makedirs('D:/image/', exist_ok=True)
num = 1
for i in data_url:
    print('第', num, '图片下载成功', i)
    # Use the remote file name as-is; only append ".jpg" when the URL
    # carries no extension (the original unconditionally appended it,
    # producing names like "photo.jpg.jpg").
    name = i.split('/')[-1]
    if not os.path.splitext(name)[1]:
        name += '.jpg'
    image = requests.get(url=i, headers=headers)
    with open(f'D:/image/{name}', mode='wb') as f:
        f.write(image.content)
        num += 1

代码三

import re
import requests
import os

pages = int(input('输入你想要爬取的页数>>>:'))
# makedirs(exist_ok=True) replaces the exists-check + mkdir pair and also
# creates D:/ intermediates if needed.
os.makedirs('D:/text/', exist_ok=True)

# range(1, pages + 1): the original range(1, pages) dropped the last page.
for page in range(1, pages + 1):

    print('\n============正在爬取第', page, '页============\n')
    url = f'https://fulizx2.cc/index.php/art/type/id/21/page/{page}.html'
    data = requests.get(url)
    # Each match is (relative link, title, anchor text).
    html = re.findall('<a href="(.*?)" title="(.*?)" target="_blank">(.*?)</a>', data.text)
    len_num = len(html)
    print('此页面共', len_num, '篇小说')

    for item in html:  # renamed: the original reused `i` across nested loops
        title = item[1]      # story title
        html_url = item[0]   # relative link
        html_url = 'https://fulizx2.cc/' + html_url
        # Build the absolute URL and report what we're fetching.
        print('《' + title + '》', '\t', html_url)
        data1 = requests.get(html_url)
        # Story body lives inside <book><p>…</p></book>.
        book = re.findall('<book><p>(.*?)</p></book>', data1.text)
        for chunk in book:
            # Turn the HTML line breaks into real newlines.
            text = re.sub('</div>|<br>', '\n', chunk).replace('? ? ', '&nbsp;')
            # with-block closes the handle; the original leaked it via a bare
            # open(...).write(...). Append mode keeps multi-chunk stories intact.
            with open(f"D://text//{title}.txt", mode='a', encoding='utf-8') as f:
                f.write(text)

代码四

import re

import requests

# Wallpaper listing pages 1..29.
for page in range(1, 30):

    url = f'https://m.woyaogexing.com/shouji/index_{page}.html'
    print('\n=================正在爬取第', page, '页=================\n')
    data = requests.get(url)
    data.encoding = data.apparent_encoding
    # Links to per-album detail pages.
    html_url = re.findall('<div class="m-img-wrap"><a href="(.*?)"', data.text)
    for href in html_url:
        url_html = 'https://m.woyaogexing.com' + href
        data1 = requests.get(url_html)
        # BUG FIX: detect the encoding of data1 itself — the original used
        # data.apparent_encoding, i.e. the *listing* page's encoding.
        data1.encoding = data1.apparent_encoding
        img_url = re.findall('<a href="//(.*?)" class="swipebox">', data1.text)
        # The album title is constant per detail page — look it up once
        # instead of once per image; guard against pages without an <h1>.
        titles = re.findall('<h1 class="m-page-title">(.*?)</h1>', data1.text)
        title = titles[0] if titles else ''
        for src in img_url:
            image = 'https://' + src
            print(title, image)



import re

import requests

# Couple-avatar listing pages 1..29.
for page in range(1, 30):
    url = f'https://m.woyaogexing.com/touxiang/qinglv/index_{page}.html'
    print('\n=================正在爬取第', page, '页=================\n')
    data = requests.get(url)
    data.encoding = data.apparent_encoding
    # Each match is (relative link, title).
    html = re.findall('<a class="f-bd-4 f-elips" href="(.*?)" alt="(.*?)">', data.text)
    for link in html:
        title = link[1]
        html_url = 'https://m.woyaogexing.com/' + link[0]
        data2 = requests.get(html_url)
        data2.encoding = data2.apparent_encoding
        # Lazy-loaded image sources (protocol-relative, "//" prefix stripped).
        img_srcs = re.findall('data-src="//(.*?)"/>', data2.text)

        for src in img_srcs:
            # FIX: use a distinct name — the original rebound `img_url`
            # (the very list being iterated) to a string on each pass.
            full_url = 'https://' + src
            print(title, full_url)

俄乌战争局势

import re

import requests

# people.cn subject-timeline API (articleId=771, 20 entries per page).
url = 'http://app.people.cn/api/v2/subjects/subjectTimelineList?articleId=771&size=20&pageToken=1&_t=1687915057&protocol=false'

# Fetch the feed and print one "date  title  remark" line per entry.
response = requests.get(url).json()
for entry in response['item']:
    articleTitle = entry['articleTitle']
    # Drop the "+0800" timezone suffix and turn the ISO "T" separator into
    # a space, e.g. "2023-06-28T10:00:00+0800" -> "2023-06-28 10:00:00".
    datePoint = re.sub('\\+0800', '', entry['datePoint']).replace('T', ' ')
    remark = entry['remark']
    print(datePoint, articleTitle, remark)

TEST

import re

import requests

url = 'https://www.sstuku6.xyz/bb58/?shouye'

# Front page: collect each post's detail-page link from the entry titles.
front_html = requests.get(url).text
post_links = re.findall(' <h2 class="entry-title"><a href="(.*?)" target="_blank" title=', front_html)

for link in post_links:
    detail_url = 'https://www.sstuku6.xyz' + link
    detail_html = requests.get(detail_url).text
    # Lazy-loaded image sources plus the post title on the detail page.
    image_srcs = re.findall('<img class="lazyload" data-src="(.*?)"', detail_html)
    post_title = re.findall('<h1 class="entry-title">(.*?)</h1>', detail_html)[0]
    for image_src in image_srcs:
        print(post_title, image_src)





3

海报

正在生成.....

评论 (0)

取消