Scraping web page information with a Python crawler

Date: 2023-07-10  admin  IT Training

Crawler workflow: preparation ➡️ fetch the page and get the data (the core step) ➡️ parse the content ➡️ save the data.
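A minimal sketch of this four-stage pipeline might look like the snippet below; fetch, parse, save and crawl are illustrative placeholder names, not the functions used in the full script further down.

# Minimal sketch of the pipeline: prepare -> fetch the page -> parse -> save.
# fetch/parse/save/crawl are placeholder names, not the functions used later in this post.
import urllib.request
from bs4 import BeautifulSoup

def fetch(url):
    # fetch the raw page source
    return urllib.request.urlopen(url).read().decode("utf-8")

def parse(html):
    # turn the source into structured records (here: the text of every <span class="title">)
    soup = BeautifulSoup(html, "html.parser")
    return [tag.get_text(strip=True) for tag in soup.find_all("span", class_="title")]

def save(records, path):
    # persist the records (here: one record per line in a text file)
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(records))

def crawl(url, path):
    save(parse(fetch(url)), path)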

Parsing the page content: use BeautifulSoup to locate the relevant tags, then use regular expressions to pull out the exact values.

Preparation: import the libraries the script needs.

The regular-expression objects (their names start with find) are used to filter out the specific pieces of information (regular expressions come from the re library).
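As a small illustration of how the two work together, the snippet below parses a made-up HTML fragment shaped like one item of the movie list (the URL and title in it are placeholders):

import re
from bs4 import BeautifulSoup

# Made-up fragment shaped like one list item; the URL and title are placeholders
snippet = '<div class="item"><a href="https://example.com/movie/1"><span class="title">示例电影</span></a></div>'

findLink = re.compile(r'<a href="(.*?)">')                    # "find"-prefixed rule: detail link
findTitle = re.compile(r'<span class="title">(.*)</span>')    # "find"-prefixed rule: title

soup = BeautifulSoup(snippet, "html.parser")
for item in soup.find_all('div', class_="item"):   # BeautifulSoup locates the tag block
    item = str(item)
    print(re.findall(findLink, item)[0])           # the regex pulls the exact value out of it
    print(re.findall(findTitle, item)[0])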

The example below scrapes, for each movie: the detail-page link, the image link, the Chinese title, the foreign title, the rating, the number of ratings, the one-line summary, and the related information.

The code is as follows:

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup         # page parsing, data extraction
import re                             # regular expressions, text matching
import urllib.request, urllib.error   # build the URL request and fetch the page data
import sqlite3                        # SQLite database operations, data export (optionally: import pandas as pd)
import xlwt                           # Excel operations

# Compiled regular expressions -- the extraction rules
findLink = re.compile(r'<a href="(.*?)">')                    # movie detail link
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)           # image link
findTitle = re.compile(r'<span class="title">(.*)</span>')    # title
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # rating
findJudge = re.compile(r'<span>(\d*)人评价</span>')           # number of ratings
findInq = re.compile(r'<span class="inq">(.*)</span>')        # one-line summary
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)           # related information


# Preparation
def main():
    baseurl = "="                    # target URL (stripped in the original post; e.g. the Douban Top 250 list URL ending in "?start=")
    datalist = getData(baseurl)      # holds the scraped records
    dbpath = "命名.db"               # database file created in the current directory
    saveData2DB(datalist, dbpath)    # save the data


# Fetch and parse the pages
def getData(baseurl):
    datalist = []                    # holds the scraped records
    for i in range(0, 10):           # Top 250 movies, 25 per page, so 10 requests
        url = baseurl + str(i * 25)
        html = askURL(url)           # fetch the page source

        # Parse the page, then extract each field with the regular expressions above
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):   # each matching block is one movie
            data = []                # all fields of one movie
            item = str(item)

            link = re.findall(findLink, item)[0]            # extract with the regex
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)

            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                ctitle = titles[0]
                data.append(ctitle)
                otitle = titles[1].replace("/", "")         # strip the separator character
                data.append(otitle)
            else:
                data.append(titles[0])
                data.append(' ')

            rating = re.findall(findRating, item)[0]
            data.append(rating)
            judgeNum = re.findall(findJudge, item)[0]
            data.append(judgeNum)

            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")               # drop the trailing full stop
                data.append(inq)
            else:
                data.append(" ")

            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', "", bd)
            bd = re.sub('/', "", bd)
            data.append(bd.strip())

            datalist.append(data)
    return datalist


# Request a single URL and return the page source
def askURL(url):
    head = {   # mimic a browser's request headers when talking to the Douban server
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    # The User-Agent tells the server what kind of machine and browser we are
    # (essentially, what content we can accept). Without it, some sites detect
    # the crawler and return an error.
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")   # decode as utf-8 to avoid garbled text
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


# Save the data to the SQLite database
def saveData2DB(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    for data in datalist:
        for index in range(len(data)):
            if index == 4 or index == 5:   # score and rating count stay numeric
                continue
            data[index] = '"' + data[index] + '"'
        sql = '''insert into movie250(
                     info_link, pic_link, cname, ename, score, rated, instroduction, info)
                 values (%s)''' % ",".join(data)
        # print(sql)   # print the statement for debugging
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()


# Create the table
def init_db(dbpath):
    sql = '''create table movie250(
                 id integer primary key autoincrement,
                 info_link text,
                 pic_link text,
                 cname varchar,
                 ename varchar,
                 score numeric,
                 rated numeric,
                 instroduction text,
                 info text)'''
    conn = sqlite3.connect(dbpath)
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    conn.close()


if __name__ == "__main__":        # program entry point
    main()                        # run the crawler
    # init_db("movietest.db")     # standalone test of the table creation
    print("爬取完毕!")

To save the data to an Excel spreadsheet instead, replace the saveData2DB call in main() with saveData(datalist, savepath) and use the function below.

# Save the data to a spreadsheet
def saveData(datalist, savepath):
    print("save.......")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)          # create the workbook
    sheet = book.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)     # create the worksheet
    col = ("电影详情链接", "图片链接", "影片中文名", "影片外国名", "评分", "评价数", "概况", "相关信息")   # column headers
    # write the rows of datalist one by one
    for i in range(0, 8):
        sheet.write(0, i, col[i])            # header row
    for i in range(0, 250):
        # print("第%d条" % (i + 1))          # progress output for debugging
        data = datalist[i]
        for j in range(0, 8):
            sheet.write(i + 1, j, data[j])   # data cells
    book.save(savepath)                      # save the file
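After a run, the saved data can be spot-checked with a few lines of sqlite3 (a sketch; it assumes the 命名.db file and the movie250 table created by the code above):

import sqlite3

conn = sqlite3.connect("命名.db")            # the database file created by main()
cur = conn.cursor()
cur.execute("select cname, score, rated from movie250 order by score desc limit 5")
for row in cur.fetchall():
    print(row)                               # (Chinese title, rating, number of ratings)
cur.close()
conn.close()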

Scraping data from East Money (东方财富):

Reference link: ()

The data to scrape (shown in a screenshot in the original post):

The complete code is as follows:

import re          # regular expressions, text matching
import random
import time
import requests
import pymysql

# East Money quote-list API request URL for page 1 (the scheme and host were stripped
# from the original post, hence the leading "="); pn is the page number, pz the page size.
urls = r'=jQuery112307026751684189851_1634985344914&fid=f184&po=1&pz=50&pn=1&np=1&fltt=2&invt=2&fields=f2%2Cf3%2Cf12%2Cf13%2Cf14%2Cf62%2Cf184%2Cf225%2Cf165%2Cf263%2Cf109%2Cf175%2Cf264%2Cf160%2Cf100%2Cf124%2Cf265%2Cf1&ut=b2884a393a59ad64002292a3e90d46a5&fs=m%3A0%2Bt%3A6%2Bf%3A!2%2Cm%3A0%2Bt%3A13%2Bf%3A!2%2Cm%3A0%2Bt%3A80%2Bf%3A!2%2Cm%3A1%2Bt%3A2%2Bf%3A!2%2Cm%3A1%2Bt%3A23%2Bf%3A!2%2Cm%3A0%2Bt%3A7%2Bf%3A!2%2Cm%3A1%2Bt%3A3%2Bf%3A!2'

# html = requests.get(urls).text   # quick test fetch of the first page

headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Cookie': 'widget_dz_id=54511; widget_dz_cityValues=,; timeerror=1; defaultCityID=54511; defaultCityName=%u5317%u4EAC; Hm_lvt_a3f2879f6b3620a363bec646b7a8bcdd=1516245199; Hm_lpvt_a3f2879f6b3620a363bec646b7a8bcdd=1516245199; addFavorite=clicked',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}   # headers: pretend to be a browser the site will accept


def main():
    # Connect to MySQL and create the target table
    db = pymysql.connect(host='localhost', port=3306, user='root',
                         password='123abc', database='test1', charset='utf8')
    cursor = db.cursor()
    sql = '''create table df(
                 id int primary key auto_increment,
                 daimas text not null,
                 names text not null,
                 zuixinjias float not null,
                 zhangdiefus float not null,
                 zhangdiees float not null,
                 chengjiaoliangs float not null,
                 chengjiaoes float not null,
                 zhenfus float not null,
                 zuigaos float not null,
                 zuidis float not null,
                 jinkais float not null,
                 zuoshous float not null,
                 liangbis float not null,
                 huanshoulvs float not null,
                 shiyinglvs float not null)'''   # create the table
    cursor.execute(sql)
    db.commit()      # commit the table creation
    db.close()       # close the connection (reopened below via ping)

    for page in range(1, 93):
        # Query parameters for this page -- the decoded form of the query string in `urls`, kept for reference
        params = (
            ('cb', 'jQuery112307026751684189851_1634985344914'),
            ('pn', str(page)),
            ('pz', '50'),
            ('po', '1'),
            ('np', '1'),
            ('ut', 'b2884a393a59ad64002292a3e90d46a5'),
            ('fltt', '2'),
            ('invt', '2'),
            ('fid', 'f3'),
            ('fs', 'm:0+t:6+f:!2,m:0+t:13+f:!2,m:0+t:80+f:!2,m:1+t:2+f:!2,m:1+t:23+f:!2,m:0+t:7+f:!2,m:1+t:3+f:!2'),
            ('fields', 'f2,f3,f12,f13,f14,f62,f184,f225,f165,f263,f109,f175,f264,f160,f100,f124,f265,f1'),
        )

        seconds = random.randint(3, 6)
        time.sleep(seconds)               # pause a few seconds between requests
        # Build the URL for this page by substituting the page number into the template
        url = urls.replace('&pn=1&', '&pn=%d&' % page)
        response = requests.get(url, headers=headers).text

        # Parse the JSONP response with regular expressions
        daimas = re.findall('"f12":(.*?),', response)           # stock code
        names = re.findall('"f14":"(.*?)"', response)           # stock name
        zuixinjias = re.findall('"f2":(.*?),', response)        # latest price
        zhangdiefus = re.findall('"f3":(.*?),', response)       # change (%)
        zhangdiees = re.findall('"f4":(.*?),', response)        # change (amount)
        chengjiaoliangs = re.findall('"f5":(.*?),', response)   # volume
        chengjiaoes = re.findall('"f6":(.*?),', response)       # turnover
        zhenfus = re.findall('"f7":(.*?),', response)           # amplitude
        zuigaos = re.findall('"f15":(.*?),', response)          # high
        zuidis = re.findall('"f16":(.*?),', response)           # low
        jinkais = re.findall('"f17":(.*?),', response)          # open
        zuoshous = re.findall('"f18":(.*?),', response)         # previous close
        liangbis = re.findall('"f10":(.*?),', response)         # volume ratio
        huanshoulvs = re.findall('"f8":(.*?),', response)       # turnover rate
        shiyinglvs = re.findall('"f9":(.*?),', response)        # P/E ratio

        # Group the fields into one 15-value row per stock (first 20 rows of each page)
        ls = []
        ls1 = []
        for i in range(20):
            ls1.append(daimas[i])
            ls1.append(names[i])
            ls1.append(zuixinjias[i])
            ls1.append(zhangdiefus[i])
            ls1.append(zhangdiees[i])
            ls1.append(chengjiaoliangs[i])
            ls1.append(chengjiaoes[i])
            ls1.append(zhenfus[i])
            ls1.append(zuigaos[i])
            ls1.append(zuidis[i])
            ls1.append(jinkais[i])
            ls1.append(zuoshous[i])
            ls1.append(liangbis[i])
            ls1.append(huanshoulvs[i])
            ls1.append(shiyinglvs[i])
            if len(ls1) == 15:
                ls.append(ls1)
                ls1 = []

        # Write the rows into the table created above
        query = ('insert into df (daimas, names, zuixinjias, zhangdiefus, zhangdiees, '
                 'chengjiaoliangs, chengjiaoes, zhenfus, zuigaos, zuidis, jinkais, '
                 'zuoshous, liangbis, huanshoulvs, shiyinglvs) '
                 'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        for j in ls:
            db.ping(reconnect=True)   # reopen the connection if it has been closed
            cursor = db.cursor()
            cursor.execute(query, j)
            db.commit()
    db.close()


if __name__ == '__main__':
    main()
    print("爬取成功!")
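Similarly, the MySQL table can be spot-checked after the run (a sketch; it reuses the connection settings and the df table from the code above):

import pymysql

db = pymysql.connect(host='localhost', port=3306, user='root',
                     password='123abc', database='test1', charset='utf8')
cursor = db.cursor()
cursor.execute("select daimas, names, zuixinjias, zhangdiefus from df limit 10")
for row in cursor.fetchall():
    print(row)                               # (code, name, latest price, change %)
cursor.close()
db.close()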

Scraping complete!