介绍
自行测试。代码大部分都是使用 re 正则表达式提取数据……
代码一
import re
import requests
import os
"""
Site: ca789.com — test at your own discretion.
Downloads every image of every gallery in a chosen category,
saving them as sequentially numbered .jpg files.
"""
# How many listing pages to crawl and which gallery category to use.
pages = int(input('输入你要爬取的页数'))
types = input("输入你想要爬取的类型(toupai、meitui、oumei、katong)>>>:")
save_dir = f'D:/image/{types}/'
# makedirs creates missing parent folders too; plain mkdir crashed when
# D:/image itself did not exist yet.
os.makedirs(save_dir, exist_ok=True)
num = 1
# range(1, pages + 1) so the user really gets `pages` pages (was off by one).
for page in range(1, pages + 1):
    url = f'https://lca789.com/pic/{types}/index_{page}.html'
    # Category can be toupai / meitui / oumei / katong.
    print('\n===============正在爬取第', page, '页===============\n\n')
    print('类型:', types)
    data = requests.get(url)
    # Detail-page links found on the listing page.
    link = re.findall('<dd><a href="(.*?)" target="_blank"><h3>', data.text)
    for i in link:
        html_url = 'https://lca789.com/' + i
        data1 = requests.get(html_url)
        img_url = re.findall("<img src='(.*?)'><br><br>", data1.text)
        title = re.findall("<title>(.*?)</title>", data1.text)[0]
        for src in img_url:
            # Strip stray markup fragments the pattern sometimes captures.
            src = re.sub("'><br><img src=|'> <br><img src='", '', src)
            print(title, src)
            data2 = requests.get(src).content
            with open(f'{save_dir}{num}.jpg', mode='wb') as f:
                f.write(data2)
            num += 1
print('\n===============爬取结束===============\n')
代码二
import re
import requests
import os

# Browser-style headers; the cookie was captured from a live session.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54',
    'cookie': 'yabs-sid=1394784121678716100; is_gdpr=0; is_gdpr_b=CJ6rGBDZqwE=; yandexuid=9617169011678716092; yuidss=9617169011678716092; i=BXtIPJOfVAf3+Y2wHz+oX9kKFt7x2/gt1yiZdaR+c0Q3GvRek5COFcuDb8QD5Sz31xEWBn6wPoEfZqXTHuqsfsGEda4=; yp=1678802891.yu.3878431071678716100; ymex=1681308491.oyu.3878431071678716100#1994076100.yrts.1678716100#1994076100.yrtsi.1678716100',
}
url = 'https://xn---50ppiccom-4s2r687bes0e.www-50ppic.com/?fuli.one'
data = requests.get(url=url, headers=headers).text
# Image URLs on the landing page.
data_url = re.findall('<img src="(.*?)" /></a></div><div class="item">', data)
len_num = len(data_url)
print('==============共', len_num, '张图片==============')
# Make sure the target folder exists (the original crashed when it did not).
os.makedirs('D:/image', exist_ok=True)
num = 1
for i in data_url:
    title = i.split('/')[-1]
    # The last URL segment normally already ends in an extension; only add
    # '.jpg' when it is missing (the original always produced 'x.jpg.jpg').
    if '.' not in title:
        title += '.jpg'
    image = requests.get(url=i, headers=headers)
    with open(f'D:/image/{title}', mode='wb') as f:
        f.write(image.content)
    # Report success only after the bytes are on disk (was printed beforehand).
    print('第', num, '图片下载成功', i)
    num += 1
代码三
import re
import requests
import os

# How many listing pages of stories to crawl.
pages = int(input('输入你想要爬取的页数>>>:'))
# makedirs with exist_ok replaces the manual exists()/mkdir dance.
os.makedirs('D:/text/', exist_ok=True)
# range(1, pages + 1) so the user really gets `pages` pages (was off by one).
for page in range(1, pages + 1):
    print('\n============正在爬取第', page, '页============\n')
    url = f'https://fulizx2.cc/index.php/art/type/id/21/page/{page}.html'
    data = requests.get(url)
    # (relative-link, title, anchor-text) tuples for each story on the page.
    html = re.findall('<a href="(.*?)" title="(.*?)" target="_blank">(.*?)</a>', data.text)
    print('此页面共', len(html), '篇小说')
    for item in html:
        title = item[1]                              # story title
        html_url = 'https://fulizx2.cc/' + item[0]   # absolute detail-page URL
        print('《' + title + '》', '\t', html_url)
        data1 = requests.get(html_url)
        # Story body paragraphs.
        book = re.findall('<book><p>(.*?)</p></book>', data1.text)
        # Strip characters Windows forbids in filenames so open() cannot fail.
        safe_title = re.sub(r'[\\/:*?"<>|]', '', title)
        for paragraph in book:
            # Turn block/line-break tags into newlines, drop filler glyphs.
            text = re.sub('</div>|<br>', '\n', paragraph).replace('? ? ', ' ')
            # `with` closes the handle each time (the original leaked open files).
            with open(f'D:/text/{safe_title}.txt', mode='a', encoding='utf-8') as f:
                f.write(text)
代码四
import re
import requests

# Crawl phone-wallpaper listing pages and print each image's title and URL.
for page in range(1, 30):
    url = f'https://m.woyaogexing.com/shouji/index_{page}.html'
    print('\n=================正在爬取第', page, '页=================\n')
    data = requests.get(url)
    data.encoding = data.apparent_encoding
    # Detail-page links on the listing page.
    html_url = re.findall('<div class="m-img-wrap"><a href="(.*?)"', data.text)
    for link in html_url:
        url_html = 'https://m.woyaogexing.com' + link
        data1 = requests.get(url_html)
        # Bug fix: detect the encoding of THIS response — the original reused
        # the listing page's apparent_encoding for the detail page.
        data1.encoding = data1.apparent_encoding
        img_url = re.findall('<a href="//(.*?)" class="swipebox">', data1.text)
        if img_url:
            # Title is per detail page; hoisted out of the per-image loop
            # (the original re-ran this findall for every single image).
            title = re.findall('<h1 class="m-page-title">(.*?)</h1>', data1.text)[0]
            for img in img_url:
                image = 'https://' + img
                print(title, image)
import re
import requests

# Crawl couple-avatar listing pages and print each image's title and URL.
for page in range(1, 30):
    url = f'https://m.woyaogexing.com/touxiang/qinglv/index_{page}.html'
    print('\n=================正在爬取第', page, '页=================\n')
    data = requests.get(url)
    data.encoding = data.apparent_encoding
    # (relative-link, title) pairs from the listing page; unpacked directly
    # instead of indexing i[0]/i[1].
    html = re.findall('<a class="f-bd-4 f-elips" href="(.*?)" alt="(.*?)">', data.text)
    for link, title in html:
        html_url = 'https://m.woyaogexing.com/' + link
        data2 = requests.get(html_url)
        data2.encoding = data2.apparent_encoding
        img_urls = re.findall('data-src="//(.*?)"/>', data2.text)
        # Distinct names throughout: the original rebound `img_url` inside its
        # own iteration loop and reused `i` for both nesting levels.
        for src in img_urls:
            full_url = 'https://' + src
            print(title, full_url)
俄乌战争局势
import re
import requests

# Fetch the People's Daily subject-timeline feed and print one line per entry:
# normalised timestamp, article title, remark.
url = 'http://app.people.cn/api/v2/subjects/subjectTimelineList?articleId=771&size=20&pageToken=1&_t=1687915057&protocol=false'
payload = requests.get(url).json()
for entry in payload['item']:
    # Normalise the timestamp: drop the "+0800" offset and turn the ISO "T"
    # separator into a space.
    when = re.sub('\\+0800', '', entry['datePoint']).replace('T', ' ')
    print(when, entry['articleTitle'], entry['remark'])
TEST
import re
import requests

# Crawl the gallery front page, then print every image URL of every gallery.
url = 'https://www.sstuku6.xyz/bb58/?shouye'
r = requests.get(url).text
# Gallery detail-page links.
html_url = re.findall(' <h2 class="entry-title"><a href="(.*?)" target="_blank" title=', r)
for path in html_url:
    htm_url = 'https://www.sstuku6.xyz' + path
    rs = requests.get(htm_url).text
    # Lazy-loaded image sources plus the gallery title on the detail page.
    img_url = re.findall('<img class="lazyload" data-src="(.*?)"', rs)
    img_title = re.findall('<h1 class="entry-title">(.*?)</h1>', rs)[0]
    for src in img_url:
        # (the original assigned `img = k` here but never used it)
        print(img_title, src)
评论 (0)