使用爬虫爬取洛谷题目数据
最近突然对爬虫产生了兴趣,之后就花了1天的时间学完了中国大学MOOC网站上的Python网络爬虫与信息提取课程,之后成功爬取了洛谷的题目信息,算是达到最初的目标了。
契机
两天前洛谷终于AC200,突然想分析一下AC的这200个题的难度分布,结果手动查了198道题的难度,愣是找不到剩下两个题,最后没办法,突然想到曾经在MOOC网站上看到过爬虫教程,于是决定用爬虫把洛谷的所有题目数据都爬下来,之后从中找出那200题就比较简单了。
成果
爬取洛谷所有题目难度——从题目列表中爬取
import requests
import re
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Fetch *url* and return the decoded page text.

    Returns "" on any request failure (timeout, HTTP 4xx/5xx, connection
    error) so callers can treat a failed page as simply missing.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn HTTP error statuses into exceptions
        # Guess the real encoding from the body; response headers are often wrong.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
        return ""
def parsePage(ilt, html):
    """Parse one problem-list page and append one record per problem to *ilt*.

    Each record is [name, id, difficulty, accepted, submitted, pass_rate, tags].
    Parse failures are swallowed (best-effort crawl) so one malformed page
    does not abort the whole run.
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        rows = soup.find_all("div", attrs={'class': "am-g lg-table-row lg-table-bg0"})
        for pr in rows:
            # Hoist repeated DOM queries: the original re-ran the same
            # find_all() up to three times per field.
            links = pr.find_all('a')
            pr_name = links[1].string
            pr_no = links[1].previous_sibling  # the id text precedes the title link
            pr_hard = pr.find_all('div', attrs={'class': "am-text-right"})[0].previous_sibling.string
            pr_tag = [i.string for i in
                      pr.find_all('span', attrs={'class': "lg-right am-text-right"})[0].find_all('span')]
            # The progress bar text is "accepted/submitted"; the pass rate is
            # embedded in its inline style attribute ("width: NN%").
            bar = pr.find_all('div', attrs={'class': "am-progress-bar am-progress-bar-secondary"})[0]
            pr_ac, pr_try = bar.string.split('/')[0], bar.string.split('/')[1]
            pr_rate = bar.attrs['style'].split(': ')[1]
            ilt.append([pr_name, pr_no, pr_hard, pr_ac, pr_try, pr_rate, pr_tag])
    except Exception:
        # Narrowed from bare `except:`; keep the original best-effort marker.
        print("")
def printProblemsList(ilt, path):
    """Print the crawled problem list to stdout and append it to *path*.

    ilt  -- list of [name, id, difficulty, accepted, submitted, rate, tags]
    path -- output text file; opened in append mode so repeated runs accumulate.
    """
    tplt = "{:15}\t{:5}\t{:10}\t{:7}\t{:7}\t{:15}"  # console layout (no tag column)
    tpll = "{}\t{}\t{}\t{}\t{}\t{}\t{}"             # file header, includes tag column
    tplr = "{}\t{}\t{}\t{}\t{}\t{}\t"               # file row; tags are appended after
    # `with` guarantees the handle is flushed and closed; the original
    # opened the file and never closed it.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(tpll.format("题目名称", "题号", "难度", "通过数", "提交数", "通过率", "标签") + '\n')
        for g in ilt:
            print(tplt.format(g[0], g[1], g[2], g[3], g[4], g[5]))
            f.write(tplr.format(g[0], g[1], g[2], g[3], g[4], g[5]))
            # Each tag is written comma-terminated (trailing comma kept to
            # preserve the original file format exactly).
            for tag in g[6]:
                f.write(tag + ',')
            f.write("\n")
def main():
    """Crawl the Luogu problem-list pages and dump every problem to a file.

    depth: number of list pages to fetch.  75 covers Luogu-only problems and
    374 would cover all remote-judge OJs, but a site bug caps the listing at
    200 pages, so 199 is the useful maximum — higher values waste time.
    """
    depth = 199
    # This URL includes Luogu plus remote-judge OJ problems; the plain
    # 'lists?&page=' variant returns Luogu-only problems.
    start_url = 'https://www.luogu.org/problemnew/lists?name=&orderitem=pid&tag=&content=0&type=1%7C13%7C14%7C15%7C16&page='
    infoList = []
    for i in range(depth):
        try:
            url = start_url + str(i + 1)  # pages are 1-based
            html = getHTMLText(url)
            parsePage(infoList, html)
            print("已完成{:4}".format(i))
        except Exception:
            continue  # best effort: skip pages that fail to fetch or parse
    # Raw string: '\l' happens not to be a Python escape today, but r'' is safe.
    path = r'D:\luoguProblems.txt'
    printProblemsList(infoList, path)


main()
爬取洛谷所有题目难度——从题目页面中爬取
import requests
import re
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Fetch *url* and return the decoded page text.

    Returns "" on any request failure (timeout, HTTP 4xx/5xx, connection
    error) so callers can treat a failed page as simply missing.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn HTTP error statuses into exceptions
        # Guess the real encoding from the body; response headers are often wrong.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
        return ""
def parsePage(ilt, html, p):
    """Extract one problem page's difficulty and append [problem_no, difficulty].

    p is the zero-based loop index from main(), which fetches P(p + 1000), so
    the stored problem number must use the same offset.  (The original used
    p + 3866, which did not match the crawled URLs.)
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        plt = p + 1000  # must mirror main()'s `str(i + 1000)` URL numbering
        # Deep positional walk to the difficulty label; brittle against any
        # markup change, but matched the page structure at crawl time.
        tlt = (soup.body.contents[5].contents[3].contents[3].contents[1]
               .contents[1].contents[1].contents[1].contents[9]
               .contents[2].contents[0].string)
        ilt.append([plt, tlt])
    except Exception:
        # Narrowed from bare `except:`; keep the original best-effort marker.
        print("")
def printGoodsList(ilt):
    """Pretty-print [problem_number, difficulty] records as a numbered table."""
    tplt = "{:4}\t{:5}\t{:16}"   # header layout
    tpla = "{:4}\tP{:4}\t{:16}"  # row layout: numeric id rendered with 'P' prefix
    print(tplt.format("序号", "题号", "难度"))
    # enumerate replaces the original hand-rolled counter variable.
    for count, g in enumerate(ilt, start=1):
        print(tpla.format(count, g[0], g[1]))
def main():
    """Crawl problem pages P1000..P4765 one by one and print their difficulty."""
    depth = 3766  # number of consecutive problem ids to try
    start_url = 'https://www.luogu.org/problemnew/show/P'
    infoList = []
    for i in range(depth):
        try:
            url = start_url + str(i + 1000)  # problem ids start at P1000
            html = getHTMLText(url)
            parsePage(infoList, html, i)
            print("已完成{:4}".format(i))
        except Exception:
            continue  # best effort: skip problems that fail to fetch or parse
    printGoodsList(infoList)


main()
其他数据
其实题目页面上还有一些其他的数据我也做了研究,但是因为没有需求所以没有爬,现把其他数据也发出来:
从题目页面爬取:
# Exploratory notes: locating data on a single problem page.
# NOTE: the original used C-style '//' trailing comments, which Python parses
# as floor division against undefined names; rewritten with '#' so the
# snippet is valid Python.
kv = {'user-agent': 'Mozilla/5.0'}
og = "1001"
url = 'https://www.luogu.org/problemnew/show/P'
r = requests.get(url + og, headers=kv)
r.text[6500:7000]
soup = BeautifulSoup(r.text, "html.parser")
soup.body.contents[5].contents  # main content
soup.body.contents[5].contents[3].contents[1]  # problem info
soup.body.contents[5].contents[3].contents[3]  # description + problem info
soup.body.contents[5].contents[3].contents[3].contents[1]  # problem info
soup.body.contents[5].contents[3].contents[3].contents[3]  # problem description
soup.body.contents[5].contents[3].contents[3].contents[1].contents[1]  # info (pass counts etc.)
soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1]  # info without buttons
soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[1]  # submission count
soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[3]  # problem provider
soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[5]  # judging method
soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[7]  # tags
soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[9]  # difficulty
soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[11]  # time/memory limits
soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[9].contents[2].contents[0].string  # exact difficulty string
从题目列表爬取(这部分全用了)
# Exploratory notes: locating data in the problem-list page markup.
start_url='https://www.luogu.org/problemnew/lists?&page='
url = start_url + str(1)
r=requests.get(url)
html= r.text
soup= BeautifulSoup(html,"html.parser")
# Each problem row on the list page is one of these div elements.
h = soup.find_all("div",attrs={'class':"am-g lg-table-row lg-table-bg0"})
pr=h[0]
pr.find_all('a')[1].string
# problem name
pr.find_all('a')[1].previous_sibling
# problem number
pr.find_all('div',attrs={'class':"am-text-right"})[0].previous_sibling.string
# problem difficulty
pr.find_all('div',attrs={'class':"am-progress-bar am-progress-bar-secondary"})[0].string
# accepted/attempted counts ("AC/TRY")