import requests
import pandas as pd
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) \
AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/70.0.3538.25 \
Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'}
params ={"p":1,
"past_num":20}
r=requests.get("https://www.smzdm.com/homepage/json_more", \
params=params, headers=headers)
x=r.json()
data= x['data']
title=[]; price=[]; date=[]; category=[]; atype=[]; mall=[]
for i in range(len(data)):
    # keep only entries that carry a price, i.e. actual deal posts
    if 'article_price' in data[i]:
        atype.append(data[i]['article_type'])
        title.append(data[i]['article_title'])
        price.append(data[i]['article_price'])
        date.append(data[i]['article_date'])
        category.append(data[i]['top_category'])
        mall.append(data[i]['article_mall'])
ex_data=pd.DataFrame()
ex_data['article_title']=title
ex_data['article_price']=price
ex_data['article_date']=date
ex_data['top_category']=category
ex_data['article_mall']=mall
print(atype)
print(title)
ex_data.to_excel('smzdm.xlsx', index=False)  # note: to_excel no longer accepts an encoding argument in current pandas
import requests
import pandas as pd
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) \
AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/70.0.3538.25 \
Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'}
data=[]
for page in range(1,6):
    params ={"p":page, "past_num":page*20}
    r=requests.get("https://www.smzdm.com/homepage/json_more",
                   params=params, headers=headers)
    x = r.json()
    data = data + x['data']   # accumulate the results of all five pages
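With all five pages accumulated in data, the same field extraction as in the first example can be applied to the combined list; a minimal sketch, reusing the column names from above (the output filename is illustrative):

ex_data = pd.DataFrame([{'article_title': d['article_title'],
                         'article_price': d['article_price'],
                         'article_date': d['article_date'],
                         'top_category': d['top_category'],
                         'article_mall': d['article_mall']}
                        for d in data if 'article_price' in d])
ex_data.to_excel('smzdm_5pages.xlsx', index=False)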
import requests
from bs4 import BeautifulSoup
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) \
AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/70.0.3538.25 \
Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'}
search ="腿"   # search keyword ("leg")
r=requests.get(f"https://dxy.com/search/articles/{search}", \
headers=headers)
print(r.status_code)
print(r.url)
soup=BeautifulSoup(r.content, 'lxml')
#print(soup.prettify())
print(soup.title.string)
print(soup.title)
print(soup.head)
Tag, Attribute, and Node
print(soup.div) # the first div tag node
print(soup.div.attrs) # all attributes of that div node
print(soup.div['id']) # its id attribute
print(soup.h2) # h2 is a heading tag node (h1, h2, ...)
print(soup.h2.string) # the text wrapped inside the h2 heading tag
content_list = soup.find_all(attrs={"class":"article-title"})
# print(content_list)
article_url = []
article_title = []
for i in range(len(content_list)):
    article_url.append(content_list[i]["href"])
    article_title.append(content_list[i].contents[0].string)
print(article_url)
['https://dxy.com/article/9776',
'https://dxy.com/article/22988',
'https://dxy.com/article/41103',
'https://dxy.com/article/39274',
'https://dxy.com/article/29950',
'https://dxy.com/article/41101',
'https://dxy.com/article/26647',
'https://dxy.com/article/40272',
'https://dxy.com/article/43280',
'https://dxy.com/article/35889']
Save to .docx files
import os
import re
import pypandoc
exist=os.path.exists('dingxiang_search_leg')
if not exist:
    os.mkdir('dingxiang_search_leg')
os.chdir('dingxiang_search_leg')
for j in range(len(article_url)):
    url = article_url[j]
    title = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])",
                   "", article_title[j]) + '.docx'
    output = pypandoc.convert_file(url, 'docx', 'html', outputfile=title)
# \u4e00-\u9fa5 Unicode range for Chinese characters
# \u0030-\u0039 Unicode range for digits
# \u0041-\u005a Unicode range for uppercase letters
# \u0061-\u007a Unicode range for lowercase letters
# \uAC00-\uD7AF Unicode range for Korean
# \u3040-\u31FF Unicode range for Japanese
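As a quick check of the sanitizer, the same character filter can be run on a made-up title (a hypothetical string, purely for illustration):

sample = "腿部拉伸 101: how-to (2021)?!"   # hypothetical title
print(re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", sample))
# -> 腿部拉伸101howto2021 (only CJK characters, digits, and ASCII letters survive)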
num_page = 1
article_url = []
article_title = []
search_url = f"https://dxy.com/search/articles/{search}"   # the same search URL as above
for page in range(num_page):
    params_page={"page_index": str(page)}
    r_all = requests.get(search_url, params=params_page, headers=headers)
    soup_all = BeautifulSoup(r_all.content, 'lxml')
    content_list = soup_all.find_all(attrs={"class":"article-title"})
    for i in range(len(content_list)):
        article_url.append(content_list[i]["href"])
        article_title.append(content_list[i].contents[0].string)
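If the same article shows up on more than one result page, the collected lists can be deduplicated before exporting; a minimal sketch that keeps the first occurrence of each URL:

seen = set()
unique_url, unique_title = [], []
for u, t in zip(article_url, article_title):
    if u not in seen:
        seen.add(u)
        unique_url.append(u)
        unique_title.append(t)

The deduplicated lists can then be fed to the same pypandoc export loop shown earlier.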
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import re
import os
# Simulate a browser search to obtain the HTML page (a simpler approach)
browser = webdriver.Chrome()
browser.get("https://book.douban.com/")
search_box = browser.find_element(By.ID, "inp-query")      # Selenium 4 syntax; find_element_by_id was removed
search_box.send_keys("Python")
button = browser.find_element(By.CLASS_NAME, "inp-btn")
button.click()
page = browser.page_source
browser.close()
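If a visible browser window is not wanted, Chrome can also be driven headless; a minimal sketch, assuming Selenium 4 and a local Chrome installation:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")   # run Chrome without opening a window
browser = webdriver.Chrome(options=options)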
# Parse the HTML to get the book links and titles
soup=BeautifulSoup(page, 'lxml')
result_list = soup.find_all(attrs={"class":"title-text"})   # renamed from "list" to avoid shadowing the built-in
url_list=[]
title_list=[]
for i in result_list:
    url_list.append(i["href"])
    title_list.append(i.string)
# Create a folder for the results
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) \
AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/70.0.3538.25 \
Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'}
exist=os.path.exists('douban_search')
if not exist:
    os.mkdir('douban_search')
os.chdir('douban_search')
# Save the book cover
i=-1; fig=0
while not fig:   # keep trying books until one has a cover image
    i=i+1
    r=requests.get(url_list[i], headers=headers)
    book=BeautifulSoup(r.content, 'lxml')
    fig=book.find_all(attrs={"title":"点击看大图"})
position_1=fig[0]['src'].find("s/")
position_2=fig[0]['src'].find("/public")
src = fig[0]['src'][:position_1]+"l"+fig[0]['src'][position_2:]   # swap the small cover for the large one
fig_r=requests.get(src, headers=headers)
fig_title = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])",
                   "",title_list[i])+".jpg"
with open(fig_title, "wb") as f:
    f.write(fig_r.content)
# Save the book description
intro = book.find_all(attrs={"class":"intro"})
filename = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])",
"",title_list[i])+".txt"
for j in intro:
    with open(filename, 'a', encoding="utf-8") as file_object:
        file_object.write(j.get_text())
# Save the book's table of contents
position_3=url_list[i].find("subject/")
position_4=url_list[i].find("/",position_3+1)
position_5=url_list[i].find("/",position_4+1)
bookid="dir_"+url_list[i][position_4+1:position_5]+"_full"
toc=book.find_all(attrs={"id":bookid})   # renamed from "dir" to avoid shadowing the built-in
if toc:
    with open(filename, 'a', encoding="utf-8") as file_object:
        file_object.write(toc[0].get_text())
os.chdir('..')
import time
import random
for i in range(len(url_list)):
    r=requests.get(url_list[i], headers=headers)
    time.sleep(random.random()*3)   # pause up to 3 seconds between requests to be polite to the server
    book=BeautifulSoup(r.content, 'lxml')
    fig=book.find_all(attrs={"title":"点击看大图"})
    # This block belongs inside the for loop above, so it is indented one level
    if fig:
        position_1=fig[0]['src'].find("s/")
        position_2=fig[0]['src'].find("/public")
        src = fig[0]['src'][:position_1]+"l"+fig[0]['src'][position_2:]
        fig_r=requests.get(src, headers=headers)
        #time.sleep(random.random()*3)
        fig_title = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])",
                           "",title_list[i])+".jpg"
        with open(fig_title, "wb") as f:
            f.write(fig_r.content)
        intro = book.find_all(attrs={"class":"intro"})
        filename = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])",
                          "",title_list[i])+".txt"
        # This block belongs inside the if branch above, so it is indented two levels
        for j in intro:
            with open(filename, 'a', encoding="utf-8") as file_object:
                file_object.write(j.get_text())
        position_3=url_list[i].find("subject/")
        position_4=url_list[i].find("/",position_3+1)
        position_5=url_list[i].find("/",position_4+1)
        bookid="dir_"+url_list[i][position_4+1:position_5]+"_full"
        toc=book.find_all(attrs={"id":bookid})   # renamed from "dir" to avoid shadowing the built-in
        if toc:
            with open(filename, 'a', encoding="utf-8") as file_object:
                file_object.write(toc[0].get_text())
os.chdir('..')
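As a final hardening step, every requests.get call in the scripts above can be given a timeout so that a stalled connection does not hang the whole crawl; a minimal sketch of a small fetch helper (the function name and the 10-second value are illustrative, not part of the original code):

import time, random, requests

def polite_get(url, headers, pause=3, timeout=10):
    # wait a random fraction of `pause` seconds, then issue a bounded request
    time.sleep(random.random()*pause)
    return requests.get(url, headers=headers, timeout=timeout)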