使用爬虫爬取洛谷题目数据

使用爬虫爬取洛谷题目数据

最近突然对爬虫产生了兴趣,之后就花了1天的时间学完了中国大学MOOC网站上的Python网络爬虫与信息提取课程,之后成功爬取了洛谷的题目信息,算是达到最初的目标了。

契机

两天前洛谷终于AC200,突然想分析一下AC的这200个题的难度分布,结果手动查了198道题的难度,愣是找不到剩下两个题,最后没办法,突然想到曾经在MOOC网站上看到过爬虫教程,于是决定用爬虫把洛谷的所有题目数据都爬下来,之后从中找出那200题就比较简单了。

成果

爬取洛谷所有题目难度——从题目列表中爬取

import requests
import re
from bs4 import BeautifulSoup 

def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or "" on any failure.

    Returning "" instead of raising lets the caller's page loop simply
    skip pages that time out or answer with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Guess the encoding from the body; the header charset is unreliable here.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrow catch: only network/HTTP failures, not programming errors.
        return ""

def parsePage(ilt, html):
    """Parse one problem-list page and append one record per problem to *ilt*.

    Each record is [name, number, difficulty, ac_count, submit_count,
    pass_rate, tags].  A parse failure on a single row skips only that row,
    so the rest of the page is still collected (the original aborted the
    whole page on the first bad row).
    """
    soup = BeautifulSoup(html, "html.parser")
    rows = soup.find_all("div", attrs={'class': "am-g lg-table-row lg-table-bg0"})
    for pr in rows:
        try:
            links = pr.find_all('a')  # hoisted: was re-queried per field
            pr_name = links[1].string
            pr_no = links[1].previous_sibling
            pr_hard = pr.find_all('div', attrs={'class': "am-text-right"})[0].previous_sibling.string
            pr_tag = [i.string for i in
                      pr.find_all('span', attrs={'class': "lg-right am-text-right"})[0].find_all('span')]
            # Progress bar holds "AC/TRY" as text and the pass rate in its style attr.
            progress = pr.find_all('div', attrs={'class': "am-progress-bar am-progress-bar-secondary"})[0]
            pr_ac = progress.string.split('/')[0]
            pr_try = progress.string.split('/')[1]
            pr_rate = progress.attrs['style'].split(': ')[1]
            ilt.append([pr_name, pr_no, pr_hard, pr_ac, pr_try, pr_rate, pr_tag])
        except (AttributeError, IndexError, KeyError):
            # Markup did not match the expected layout; skip this row only.
            print("")

def printProblemsList(ilt, path):
    """Echo the scraped records to stdout and append them to *path*.

    The file gets one tab-separated header line, then one line per record:
    six fields followed by the tag list joined with trailing commas.
    """
    tplt = "{:15}\t{:5}\t{:10}\t{:7}\t{:7}\t{:15}"  # console layout
    tpll = "{}\t{}\t{}\t{}\t{}\t{}\t{}"             # file header (7 columns)
    tplr = "{}\t{}\t{}\t{}\t{}\t{}\t"               # file row, tags appended after
    # 'with' guarantees the file is closed/flushed (original leaked the handle).
    with open(path, 'a', encoding='utf-8') as f:
        f.write(tpll.format("题目名称", "题号", "难度", "通过数", "提交数", "通过率", "标签") + '\n')
        for g in ilt:
            print(tplt.format(g[0], g[1], g[2], g[3], g[4], g[5]))
            f.write(tplr.format(g[0], g[1], g[2], g[3], g[4], g[5]))
            for o in g[6]:
                f.write(o + ',')
            f.write("\n")

def main():
    """Crawl the Luogu problem list pages and dump the results to a text file."""
    # Luogu-only problems fit in 75 pages; the combined list (Luogu + remote
    # judge OJs) would need 374, but a site bug caps the visible list at 200
    # pages, so 199 is the useful maximum.
    depth = 199
    # Combined list URL; the Luogu-only list is
    # https://www.luogu.org/problemnew/lists?&page=
    start_url = 'https://www.luogu.org/problemnew/lists?name=&orderitem=pid&tag=&content=0&type=1%7C13%7C14%7C15%7C16&page='
    infoList = []
    for i in range(depth):
        url = start_url + str(i + 1)
        # getHTMLText returns "" on failure and parsePage swallows bad markup,
        # so no blanket try/except is needed here (the original's bare
        # 'except: continue' only hid real bugs).
        html = getHTMLText(url)
        parsePage(infoList, html)
        print("已完成{:4}".format(i))
    # Raw string so '\l' is not parsed as an (invalid) escape sequence.
    path = r'D:\luoguProblems.txt'
    printProblemsList(infoList, path)


if __name__ == "__main__":
    main()

爬取洛谷所有题目难度——从题目页面中爬取

import requests
import re
from bs4 import BeautifulSoup 

def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or "" on any failure.

    Returning "" instead of raising lets the caller's page loop simply
    skip pages that time out or answer with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Guess the encoding from the body; the header charset is unreliable here.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrow catch: only network/HTTP failures, not programming errors.
        return ""

def parsePage(ilt, html, p):
    """Parse one problem page and append [problem_id, difficulty] to *ilt*.

    *p* is the zero-based loop index from main().  NOTE(review): main()
    builds the URL from p + 1000 but the recorded id is p + 3866 — the two
    offsets disagree; confirm which problem range was actually crawled.
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        plt = p + 3866
        # Fixed-position walk down to the difficulty label; any change in the
        # page layout breaks this chain (see the notes at the end of the post).
        tlt = soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[9].contents[2].contents[0].string
        ilt.append([plt, tlt])
    except (AttributeError, IndexError, TypeError):
        # Missing or changed markup (e.g. a deleted problem); skip this page.
        print("")

def printGoodsList(ilt):
    """Print the collected [problem_id, difficulty] records as a numbered table."""
    header_fmt = "{:4}\t{:5}\t{:16}"
    row_fmt = "{:4}\tP{:4}\t{:16}"
    print(header_fmt.format("序号", "题号", "难度"))
    for idx, record in enumerate(ilt, start=1):
        print(row_fmt.format(idx, record[0], record[1]))

def main():
    """Visit problem pages P1000..P(1000+depth-1) and print id/difficulty pairs."""
    depth = 3766  # number of consecutive problem pages to visit
    start_url = 'https://www.luogu.org/problemnew/show/P'
    infoList = []
    for i in range(depth):
        url = start_url + str(i + 1000)  # problem ids start at P1000
        # getHTMLText returns "" on failure and parsePage swallows bad markup,
        # so the original's bare 'except: continue' only hid real bugs.
        html = getHTMLText(url)
        parsePage(infoList, html, i)
        print("已完成{:4}".format(i))
    printGoodsList(infoList)


if __name__ == "__main__":
    main()

其他数据

其实题目页面上还有一些其他的数据我也做了研究,但是因为没有需求所以没有爬,现把其他数据也发出来:
从题目页面爬取:

# Demo: fetch a single problem page the way the exploration below expects it.
kv={'user-agent':'Mozilla/5.0'}  # minimal UA header so the site serves the page
og="1001"  # problem id appended to the base URL
url='https://www.luogu.org/problemnew/show/P'
r=requests.get(url+og,headers=kv)
r.text[6500:7000]  # peek at the slice of raw HTML around the problem info

soup=BeautifulSoup(r.text,"html.parser")

soup.body.contents[5].contents  # 正文

soup.body.contents[5].contents[3].contents[1]  # 题目信息

soup.body.contents[5].contents[3].contents[3]  # 题目描述+题目信息

soup.body.contents[5].contents[3].contents[3].contents[1]  # 题目信息

soup.body.contents[5].contents[3].contents[3].contents[3]  # 题目描述

soup.body.contents[5].contents[3].contents[3].contents[1].contents[1]  # 题目信息(通过数等)

soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1]  # 无按钮

soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[1]  # 提交数

soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[3]  # 题目提供者

soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[5]  # 评测方式

soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[7]  # 标签

soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[9]  # 难度

soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[11]  # 时空限制

soup.body.contents[5].contents[3].contents[3].contents[1].contents[1].contents[1].contents[1].contents[9].contents[2].contents[0].string  # 确切难度

从题目列表爬取(这部分全用了)

# Demo: the selectors used by the list scraper above, run against page 1.
start_url='https://www.luogu.org/problemnew/lists?&page='

url = start_url + str(1)

r=requests.get(url)

html= r.text

soup= BeautifulSoup(html,"html.parser")

h = soup.find_all("div",attrs={'class':"am-g lg-table-row lg-table-bg0"})
    
pr=h[0]

pr.find_all('a')[1].string
# problem name

pr.find_all('a')[1].previous_sibling
# problem number

pr.find_all('div',attrs={'class':"am-text-right"})[0].previous_sibling.string
# problem difficulty

pr.find_all('div',attrs={'class':"am-progress-bar am-progress-bar-secondary"})[0].string
# "AC/TRY" counts as one string