import requests
import pandas as pd
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) \
AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/70.0.3538.25 \
Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'}
params = {"format": "json", "page": 1, "ps": 20, "class": "all"}
r = requests.get("https://www.gwdang.com/api/promotion_discount",
                 params=params, headers=headers)
x = r.json()
data = x['list']
site_name=[]; title=[]; price_info=[]; org_price=[]; update_time=[]; keyword=[]
for i in range(len(data)):
    site_name.append(data[i]['site_name'])
    title.append(data[i]['title'])
    price_info.append(data[i]['price_info'])
    org_price.append(data[i]['org_price'])
    update_time.append(data[i]['update_time'])
    keyword.append(data[i]['keyword'])
ex_data = pd.DataFrame()
ex_data['site_name']=site_name
ex_data['title']=title
ex_data['price_info']=price_info
ex_data['org_price']=org_price
ex_data['update_time']=update_time
ex_data['keyword']=keyword
ex_data.to_excel(excel_writer='gwdang.xlsx')
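A note on robustness: the call above assumes the API answers with valid JSON; if the endpoint is down or blocks the client, r.json() raises. A minimal guard, sketched with only the standard requests API:
r = requests.get("https://www.gwdang.com/api/promotion_discount",
                 params=params, headers=headers, timeout=10)
r.raise_for_status()       # raise HTTPError on a 4xx/5xx response
x = r.json()               # raises ValueError if the body is not JSON
data = x.get('list', [])   # fall back to an empty list if 'list' is missing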
import requests
import pandas as pd
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) \
AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/70.0.3538.25 \
Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'}
data = []
for page in range(1, 6):   # fetch the first five pages
    params = {"format": "json", "page": page, "ps": 20, "class": "all"}   # ps: items per page
    r = requests.get("https://www.gwdang.com/api/promotion_discount",
                     params=params, headers=headers)
    x = r.json()
    data = data + x['list']
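The combined data list can then be saved the same way as the single-page result; a sketch, assuming each item carries the same keys as in the first example ('gwdang_pages.xlsx' is an arbitrary filename):
ex_data = pd.DataFrame(data)   # one row per promotion item
ex_data = ex_data[['site_name', 'title', 'price_info',
                   'org_price', 'update_time', 'keyword']]
ex_data.to_excel('gwdang_pages.xlsx')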
import requests
from bs4 import BeautifulSoup
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) \
AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/70.0.3538.25 \
Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'}
search ="腿"
r=requests.get(f"https://dxy.com/search/articles/{search}", \
headers=headers)
print(r.status_code)
print(r.url)
soup=BeautifulSoup(r.content, 'lxml')
#print(soup.prettify())
print(soup.title.string)
print(soup.title)
print(soup.head)
Tags, attributes, and nodes
print(soup.div)         # the first div tag node
print(soup.div.attrs)   # all attributes of that div node
print(soup.div['id'])   # its id attribute
print(soup.h2)          # h2 is a heading tag node (h1, h2, ...)
print(soup.h2.string)   # the text wrapped by the h2 heading tag
content_list = soup.find_all(attrs={"class":"article-title"})
# print(content_list)
article_url = []
article_title = []
for i in range(len(content_list)):
    article_url.append(content_list[i]["href"])
    article_title.append(content_list[i].contents[0].string)
print(article_url)
['https://dxy.com/article/9776',
'https://dxy.com/article/22988',
'https://dxy.com/article/41103',
'https://dxy.com/article/39274',
'https://dxy.com/article/29950',
'https://dxy.com/article/41101',
'https://dxy.com/article/26647',
'https://dxy.com/article/40272',
'https://dxy.com/article/43280',
'https://dxy.com/article/35889']
Save to .docx files
import os
import re
import pypandoc
exist = os.path.exists('dingxiang_search_leg')
if not exist:
    os.mkdir('dingxiang_search_leg')
os.chdir('dingxiang_search_leg')
for j in range(len(article_url)):
    url = article_url[j]
    # strip everything that is not a Chinese character, digit, or Latin letter
    title = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])",
                   "", article_title[j]) + '.docx'
    output = pypandoc.convert_file(url, 'docx', 'html', outputfile=title)
# \u4e00-\u9fa5  Unicode range for Chinese characters
# \u0030-\u0039  Unicode range for digits
# \u0041-\u005a  Unicode range for uppercase Latin letters
# \u0061-\u007a  Unicode range for lowercase Latin letters
# \uAC00-\uD7AF  Unicode range for Korean (Hangul)
# \u3040-\u31FF  Unicode range for Japanese kana
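Since the same substitution recurs for every filename, it can be factored into a helper; a sketch (the name safe_filename is ours, and the extra ranges simply follow the comments above):
def safe_filename(text, ext):
    # keep Chinese characters, digits, Latin letters, Hangul, and kana;
    # drop everything else so the result is a safe filename
    keep = (u"\u4e00-\u9fa5"      # Chinese characters
            u"0-9A-Za-z"          # digits and Latin letters
            u"\uAC00-\uD7AF"      # Korean (Hangul)
            u"\u3040-\u31FF")     # Japanese kana
    return re.sub(u"[^" + keep + u"]", "", text) + ext

# e.g. title = safe_filename(article_title[j], '.docx')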
num_page = 1
article_url = []
article_title = []
search_url = f"https://dxy.com/search/articles/{search}"   # re-request the search page
for page in range(num_page):
    params_page = {"page_index": str(page)}
    r_all = requests.get(search_url, params=params_page, headers=headers)
    soup_all = BeautifulSoup(r_all.content, 'lxml')
    content_list = soup_all.find_all(attrs={"class": "article-title"})
    for i in range(len(content_list)):
        article_url.append(content_list[i]["href"])
        article_title.append(content_list[i].contents[0].string)
The selenium module lets Python drive a browser directly: it actually clicks links and fills in login forms, almost as if a human user were interacting with the page. Compared with Requests and Beautiful Soup, Selenium lets you interact with web pages in a much more sophisticated way. Download the Edge Driver that matches your browser version and add it to the PATH.
from selenium import webdriver
browser = webdriver.Edge()
browser.get('http://www.douban.com')
A WebDriver object has several methods for finding elements on a page. They are divided into find_element and find_elements methods. A find_element method returns a WebElement object representing the first element on the page that matches the query; a find_elements method returns a list of WebElement objects covering every matching element. Selenium supports several locator strategies, sketched below.
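A short sketch of the common locator strategies; the id, class, and link-text values below are borrowed from code later in this section, so treat them as illustrative rather than guaranteed to exist on the current page:
from selenium.webdriver.common.by import By
elem = browser.find_element(By.ID, 'inp-query')                # by id attribute
elem = browser.find_element(By.CLASS_NAME, 'inp-btn')          # by class attribute
elem = browser.find_element(By.TAG_NAME, 'div')                # by tag name
elem = browser.find_element(By.LINK_TEXT, '豆瓣读书')            # by exact link text
elem = browser.find_element(By.CSS_SELECTOR, 'div#content a')  # by CSS selector
elem = browser.find_element(By.XPATH, '//h1')                  # by XPath expression
elems = browser.find_elements(By.CLASS_NAME, 'title-text')     # every match, as a list
find_element raises NoSuchElementException when nothing matches, while find_elements simply returns an empty list.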
The WebElement objects returned by find_element and find_elements have a click() method that simulates a mouse click on the element. It can be used to follow a link, select a radio button, press a submit button, or trigger whatever else happens when that element is clicked. For example, enter the following in an interactive session:
from selenium import webdriver
from selenium.webdriver.common.by import By
browser = webdriver.Edge()
browser.get('http://www.douban.com')
linkElem = browser.find_element(By.LINK_TEXT,'豆瓣读书')
linkElem.click()
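Clicking starts a page load, and querying the new page too early raises NoSuchElementException; Selenium's explicit waits cover this. A minimal sketch with WebDriverWait (the locator here is illustrative):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# block for up to 10 seconds until the element appears in the DOM
wait = WebDriverWait(browser, 10)
elem = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'title-text')))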
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import os
from selenium.webdriver.common.by import By
# Simulate a browser search to obtain the HTML (a simpler approach)
browser = webdriver.Edge()
browser.get("https://book.douban.com/")
query_input = browser.find_element(By.ID, "inp-query")    # avoid shadowing the built-in input()
query_input.send_keys("Python")
button = browser.find_element(By.CLASS_NAME, "inp-btn")
button.click()
page = browser.page_source
browser.close()
# Parse the HTML to get the book links and titles
soup = BeautifulSoup(page, 'lxml')
content_list = soup.find_all(attrs={"class": "title-text"})   # avoid shadowing the built-in list
url_list = []
title_list = []
for i in content_list:
    url_list.append(i["href"])
    title_list.append(i.string)
# Create a folder for the results
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) \
AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/70.0.3538.25 \
Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'}
exist = os.path.exists('douban_search')
if not exist:
    os.mkdir('douban_search')
os.chdir('douban_search')
# Save the book cover: scan the results until one has a cover image
i = -1; fig = 0
while not fig:
    i = i + 1
    r = requests.get(url_list[i], headers=headers)
    book = BeautifulSoup(r.content, 'lxml')
    fig = book.find_all(attrs={"title": "点击看大图"})   # "click to view the large image"
# swap the small-cover path segment ("s") for the large one ("l")
position_1 = fig[0]['src'].find("s/")
position_2 = fig[0]['src'].find("/public")
src = fig[0]['src'][:position_1] + "l" + fig[0]['src'][position_2:]
fig_r = requests.get(src, headers=headers)
fig_title = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])",
                   "", title_list[i]) + ".jpg"
with open(fig_title, "wb") as f:
    f.write(fig_r.content)
# Save the book introduction
intro = book.find_all(attrs={"class": "intro"})
filename = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])",
                  "", title_list[i]) + ".txt"
for j in intro:
    with open(filename, 'a', encoding="utf-8") as file_object:
        file_object.write(j.get_text())