# Data-collection basics: fetch the Douban Books Top 250 list and save it
# as JSON.
# Requires the third-party packages: requests, beautifulsoup4, lxml.
import json
import os

import requests
from bs4 import BeautifulSoup


def getHTML(n):
    """Fetch one page of the Douban Books Top 250 list.

    Args:
        n: Zero-based page index (page number minus one). It is sent as
            the ``start`` query parameter multiplied by 25, the number
            of books per page.

    Returns:
        The response body as text. The HTTP status code is not checked.
    """
    url = "https://book.douban.com/top250"
    # Browser-like User-Agent header sent with every request.
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/137.0.0.0 Safari/537.36"
    }
    # A timeout keeps a stalled connection from blocking the script forever.
    r = requests.get(url, headers=header, params={"start": n * 25}, timeout=10)
    return r.text


def getlistData(html):
    """Parse one page of HTML into a list of book dictionaries.

    The page is parsed with BeautifulSoup using the ``lxml`` parser (which
    must be installed separately). Every ``<tr>`` element is inspected;
    rows that do not have a second ``<td>`` containing both a ``div > a``
    title link and a ``<p>`` element are skipped instead of raising.

    Args:
        html: HTML text of one list page.

    Returns:
        A list of dicts with the keys ``书名`` (title) and ``基本信息``
        (basic information).
    """
    soup = BeautifulSoup(html, "lxml")
    booklist = []
    for book in soup.select("tr"):
        tds = book.select("td")
        if len(tds) < 2:
            continue
        cell = tds[1]
        link = cell.div.a if cell.div is not None else None
        if link is None or cell.p is None:
            continue
        booklist.append({
            # Only the first line of the link text is kept as the title.
            "书名": link.text.strip().split("\n")[0],
            "基本信息": cell.p.text,
        })
    return booklist


def savejson(data, path, filename):
    """Write *data* as pretty-printed UTF-8 JSON to ``path``/``filename``.

    Args:
        data: Any JSON-serializable object.
        path: Target directory; created if missing. May be given with or
            without a trailing separator, or empty for the current
            directory.
        filename: Name of the file to create inside ``path``.
    """
    # json.dumps returns the text; json.dump would require a file object.
    jdata = json.dumps(data, indent=2, ensure_ascii=False)
    if path:
        os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, filename), "w", encoding="utf-8") as f:
        f.write(jdata)


if __name__ == "__main__":
    # Fetch 10 pages; each page's books are stored as their own sub-list.
    allbooks = [getlistData(getHTML(i)) for i in range(10)]
    savejson(allbooks, "data/", "douban250.json")
# 3-2 作业 (Homework 3-2)
# Data-collection basics: fetch the Douban Books Top 250 list and save it
# as JSON.
# Requires the third-party packages: requests, beautifulsoup4, lxml.
import json
import os

import requests
from bs4 import BeautifulSoup


def getHTML(n):
    """Fetch one page of the Douban Books Top 250 list.

    Args:
        n: Zero-based page index (page number minus one). It is sent as
            the ``start`` query parameter multiplied by 25, the number
            of books per page.

    Returns:
        The response body as text. The HTTP status code is not checked.
    """
    url = "https://book.douban.com/top250"
    # Browser-like User-Agent header sent with every request.
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/137.0.0.0 Safari/537.36"
    }
    # A timeout keeps a stalled connection from blocking the script forever.
    r = requests.get(url, headers=header, params={"start": n * 25}, timeout=10)
    return r.text


def getlistData(html):
    """Parse one page of HTML into a list of book dictionaries.

    The page is parsed with BeautifulSoup using the ``lxml`` parser (which
    must be installed separately). Every ``<tr>`` element is inspected;
    rows that do not have a second ``<td>`` containing both a ``div > a``
    title link and a ``<p>`` element are skipped instead of raising.

    Args:
        html: HTML text of one list page.

    Returns:
        A list of dicts with the keys ``书名`` (title) and ``基本信息``
        (basic information).
    """
    soup = BeautifulSoup(html, "lxml")
    booklist = []
    for book in soup.select("tr"):
        tds = book.select("td")
        if len(tds) < 2:
            continue
        cell = tds[1]
        link = cell.div.a if cell.div is not None else None
        if link is None or cell.p is None:
            continue
        booklist.append({
            # Only the first line of the link text is kept as the title.
            "书名": link.text.strip().split("\n")[0],
            "基本信息": cell.p.text,
        })
    return booklist


def savejson(data, path, filename):
    """Write *data* as pretty-printed UTF-8 JSON to ``path``/``filename``.

    Args:
        data: Any JSON-serializable object.
        path: Target directory; created if missing. May be given with or
            without a trailing separator, or empty for the current
            directory.
        filename: Name of the file to create inside ``path``.
    """
    # json.dumps returns the text; json.dump would require a file object.
    jdata = json.dumps(data, indent=2, ensure_ascii=False)
    if path:
        os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, filename), "w", encoding="utf-8") as f:
        f.write(jdata)


if __name__ == "__main__":
    # Fetch 10 pages; each page's books are stored as their own sub-list.
    allbooks = [getlistData(getHTML(i)) for i in range(10)]
    savejson(allbooks, "data/", "douban250.json")