python爬蟲-梨視頻短視頻爬取(線程池)
示例代碼
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
|
"""Pear Video (pearvideo.com) short-video crawler using a thread pool.

Scrapes the "life" category listing page, resolves each video's real MP4
address (the site's AJAX endpoint returns an obfuscated URL whose leading
timestamp segment must be replaced with ``cont-<contId>``), then downloads
every video concurrently with ``multiprocessing.dummy.Pool`` (thread pool).
"""
import os
import random

import requests
from lxml import etree
from multiprocessing.dummy import Pool  # thread-based Pool; same API as process Pool

# Pearvideo rejects requests without a browser User-Agent.
UA = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56')

# Directory the MP4 files are written into (created by main() if missing).
SAVE_DIR = './lishipin'


def get_video(dic):
    """Download one video to SAVE_DIR.

    Worker passed to ``Pool.map``, so it takes a single argument:
    a dict with keys ``'url'`` (direct MP4 URL) and ``'name'`` (display name).
    """
    headers = {'User-Agent': UA}
    video_data = requests.get(url=dic['url'], headers=headers).content
    print(dic['name'] + '開始下載')
    # The original code named files with int(random.random() * 100) because
    # some video names contain spaces that broke concurrent writes — but a
    # 0-99 random int collides and silently overwrites files.  The contId
    # embedded in the real URL ('.../cont-<id>-...') is unique, so use it.
    cont_id = dic['url'].split('cont-')[-1].split('-')[0]
    path = os.path.join(SAVE_DIR, cont_id + '.mp4')
    with open(path, 'wb') as fp:
        fp.write(video_data)
    print(dic['name'] + '下載成功')


def _real_url(fake_url, cont_id):
    """Rewrite the obfuscated srcUrl into the real MP4 URL.

    The only difference between the two is the final path segment's prefix::

        fake: .../20210208/1612867876612-15690592-205957-ld.mp4
        real: .../20210208/cont-1719874-15690592-205957-ld.mp4

    i.e. the fake timestamp is replaced by ``cont-<cont_id>``.
    """
    base, _, tail = fake_url.rpartition('/')
    suffix = tail.split('-', 1)[1]  # drop the fake timestamp segment
    return base + '/cont-' + str(cont_id) + '-' + suffix


def main():
    """Scrape the listing page, resolve real MP4 URLs, download in parallel."""
    web_url = 'https://www.pearvideo.com/category_5'
    headers = {'User-Agent': UA}
    web_page_text = requests.get(url=web_url, headers=headers).text
    tree = etree.HTML(web_page_text)
    # Each <li> under the video list holds one video card.
    li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
    rea_urls = []
    for li in li_list:
        video_name = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
        video_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
        # The MP4 is loaded dynamically: videoStatus.jsp returns JSON with
        # an obfuscated srcUrl that must be fixed up with the contId, which
        # is the numeric suffix of the video page URL ('.../video_<id>').
        cont_id = video_url.split('/')[-1].split('_')[1]
        mrd = random.random()  # anti-cache fraction, mimicking the site's own JS
        # 'Referer' is required, otherwise the endpoint reports the video
        # as taken down.
        ajax_headers = {
            'User-Agent': UA,
            'Referer': 'https://www.pearvideo.com/video_' + cont_id,
        }
        # e.g. https://www.pearvideo.com/videoStatus.jsp?contId=1719874&mrd=0.775...
        ajax_url = 'https://www.pearvideo.com/videoStatus.jsp'
        params = {'contId': str(cont_id), 'mrd': str(mrd)}
        ajax_json = requests.get(url=ajax_url, headers=ajax_headers,
                                 params=params).json()
        fake_url = ajax_json['videoInfo']['videos']['srcUrl']
        rea_urls.append({'url': _real_url(fake_url, cont_id),
                         'name': video_name})
    # The original code assumed ./lishipin already existed; create it so a
    # fresh checkout does not crash on the first open().
    os.makedirs(SAVE_DIR, exist_ok=True)
    # Four worker threads — downloads are network-bound, so threads overlap
    # the waits even under the GIL.
    pool = Pool(4)
    pool.map(get_video, rea_urls)
    pool.close()
    pool.join()


if __name__ == '__main__':
    main()
知識點擴展:
Python爬蟲下載視頻(梨視頻)
梨視頻示例代碼:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
"""Pear Video downloader: scrape the homepage for video links, save each MP4.

Filenames are the MD5 of the page URL plus the current time, so repeated or
concurrent downloads never collide and names are always filesystem-safe.
"""
import hashlib
import re
import time

import requests

# respose.status_code -> HTTP status; .content -> bytes; .text -> decoded text
mainurl = "https://www.pearvideo.com/"
videourl = "http://www.pearvideo.com/video_1499584"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}


def geturls(url):
    """Scrape *url* and return the absolute URLs of the video pages on it."""
    res = requests.get(url)
    urls = re.findall('class="vervideo-tbd".*?href="(.*?)" rel="external nofollow" ', res.text, re.S)
    prefix = 'https://www.pearvideo.com/'
    return [prefix + u for u in urls]


def getvideo(url):
    """Fetch a video page, extract its MP4 srcUrl, and write it to disk."""
    # BUG FIX: the original called requests.get(url, headers) — the second
    # positional argument of requests.get is `params`, so the header dict
    # was being sent as query parameters.  It must be keyword `headers=`.
    res = requests.get(url, headers=headers)
    mp4url = re.findall('srcUrl="(.*?\.mp4)"', res.text, re.S)[0]
    video = requests.get(mp4url)
    # Hash the page URL together with the current time for a unique name.
    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    m.update(str(time.time()).encode('utf-8'))
    # BUG FIX: the original set filename = '%s.mp4' % hexdigest AND appended
    # '.mp4' again in the output path, producing '<md5>.mp4.mp4' files.
    filename = m.hexdigest()
    print(filename)
    with open("/home/tony/文檔/爬蟲視頻/%s.mp4" % filename, 'wb') as f:
        f.write(video.content)


def main():
    """Download every video linked from the homepage, one after another."""
    for page_url in geturls(mainurl):
        getvideo(page_url)


if __name__ == '__main__':
    main()
到此這篇關于python爬蟲線程池案例詳解(梨視頻短視頻爬取)的文章就介紹到這了,更多相關python爬蟲梨視頻短視頻爬取內容請搜索服務器之家以前的文章或繼續瀏覽下面的相關文章希望大家以后多多支持服務器之家!
原文鏈接:https://blog.csdn.net/m0_46500590/article/details/113775998