怎么用Python爬取抖音小视频? 资深程序员都这样爬取的(附源码)
简介
抖音,是一款可以拍短视频的音乐创意短视频社交软件,该软件于2016年9月上线,是一个专注年轻人的15秒音乐短视频社区。用户可以通过这款软件选择歌曲,拍摄15秒的音乐短视频,形成自己的作品。此App已在Android各大应用商店和App Store上线。
今天咱们就用Python爬取抖音视频
准备:
环境:Python3.6+Windows
IDE:你开心就好,喜欢用哪个就用哪个
模块:
1 from splinter.driver.webdriver.chrome import Options, Chrome 2 from splinter.browser import Browser 3 from contextlib import closing 4 import requests, json, time, re, os, sys, time 5 from bs4 import BeautifulSoup
获得视频播放地址
-
参数 user_id:查询的用户ID
-
返回 video_names:视频名字列表
-
返回 video_urls:视频链接列表
-
返回 nickname:用户昵称
def get_video_urls(self, user_id):
    """
    Look up a Douyin user and collect the share links of all their videos.

    Parameters:
        user_id: the Douyin ID to search for; it is compared against the
            ``unique_id`` field of each search result
    Returns:
        video_names: list of file names, ``<share description>.mp4`` or a
            running number when the video only has the default description
        video_urls: list of share-page URLs, parallel to ``video_names``
        nickname: nickname of the matched user
    Raises:
        ValueError: if ``user_id`` is empty
        LookupError: if no search result has a matching ``unique_id``
    """
    user_id = str(user_id)
    if not user_id:
        raise ValueError('user_id must not be empty')

    video_names = []
    video_urls = []

    search_url = 'https://api.amemv.com/aweme/v1/discover/search/?cursor=0&keyword=%s&count=10&type=1&retry_type=no_retry&iid=17900846586&device_id=34692364855&ac=wifi&channel=xiaomi&aid=1128&app_name=aweme&version_code=162&version_name=1.6.2&device_platform=android&ssmix=a&device_type=MI+5&device_brand=Xiaomi&os_api=24&os_version=7.0&uuid=861945034132187&openudid=dc451556fc0eeadb&manifest_version_code=162&resolution=1080*1920&dpi=480&update_version_code=1622' % user_id
    # NOTE(review): verify=False turns off TLS certificate checking; kept
    # as in the original, confirm it is still required.
    req = requests.get(url=search_url, verify=False)
    html = json.loads(req.text)

    # The search is issued once and every returned user is inspected.
    # Re-sending the identical request until the first hit matches (the
    # previous behaviour) never terminates when the wanted user is not
    # ranked first.
    user_info = None
    for candidate in html['user_list']:
        if candidate['user_info']['unique_id'] == user_id:
            user_info = candidate['user_info']
            break
    if user_info is None:
        raise LookupError('no user with unique_id %r in the search results' % user_id)

    aweme_count = user_info['aweme_count']
    uid = user_info['uid']
    nickname = user_info['nickname']

    user_url = 'https://www.douyin.com/aweme/v1/aweme/post/?user_id=%s&max_cursor=0&count=%s' % (uid, aweme_count)
    req = requests.get(url=user_url, verify=False)
    html = json.loads(req.text)

    # Videos that only carry the default description get a running number
    # as their file name instead.
    unnamed_index = 1
    for each in html['aweme_list']:
        share_desc = each['share_info']['share_desc']
        if '抖音-原创音乐短视频社区' == share_desc:
            video_names.append(str(unnamed_index) + '.mp4')
            unnamed_index += 1
        else:
            video_names.append(share_desc + '.mp4')
        video_urls.append(each['share_info']['share_url'])

    return video_names, video_urls, nickname
获得带水印的视频播放地址
-
video_url:带水印的视频播放地址
-
download_url: 带水印的视频下载地址
def get_download_url(self, video_url):
    """
    Resolve a share-page URL to the watermarked video download address.

    Parameters:
        video_url: share-page URL of the (watermarked) video
    Returns:
        download_url: first entry of ``video.play_addr.url_list`` in the
            JSON object embedded in the page's last ``<script>`` tag
    Raises:
        IndexError: if the page has no ``<script>`` tag or the embedded
            ``var data = [...]`` payload is not found
    """
    # NOTE(review): verify=False turns off TLS certificate checking; kept
    # as in the original.
    req = requests.get(url=video_url, verify=False)
    bf = BeautifulSoup(req.text, 'lxml')

    scripts = bf.find_all('script')
    if not scripts:
        raise IndexError('no <script> tag found in %s' % video_url)

    # Raw string: "\[" and "\]" are regex escapes, not string escapes.
    matches = re.findall(r'var data = \[(.+)\];', str(scripts[-1]))
    if not matches:
        raise IndexError('no "var data = [...]" payload found in %s' % video_url)

    video_html = json.loads(matches[0])
    download_url = video_html['video']['play_addr']['url_list'][0]
    return download_url
视频下载
-
video_url: 带水印的视频地址
-
video_name: 视频名
-
watermark_flag: 是否下载不带水印的视频
def video_downloader(self, video_url, video_name, watermark_flag=True):
    """
    Download one video to disk while printing progress to stdout.

    Parameters:
        video_url: share-page URL of the (watermarked) video
        video_name: path of the file to write
        watermark_flag: when true, resolve the URL through
            ``remove_watermark``; otherwise through ``get_download_url``
    Returns:
        None
    """
    size = 0
    if watermark_flag:
        video_url = self.remove_watermark(video_url)
    else:
        video_url = self.get_download_url(video_url)

    # NOTE(review): verify=False turns off TLS certificate checking; kept
    # as in the original.
    with closing(requests.get(video_url, stream=True, verify=False)) as response:
        if response.status_code != 200:
            # Report the failure instead of silently producing no file.
            sys.stdout.write(' [下载失败]:HTTP %d\n' % response.status_code)
            return

        chunk_size = 1024
        # The header is absent on chunked responses; 0 means "unknown".
        content_size = int(response.headers.get('content-length', 0))
        if content_size > 0:
            sys.stdout.write(' [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024))

        with open(video_name, "wb") as file:
            for data in response.iter_content(chunk_size=chunk_size):
                file.write(data)
                size += len(data)

                if content_size > 0:
                    sys.stdout.write(' [下载进度]:%.2f%%' % float(size / content_size * 100) + '\r')
                else:
                    # No total size known: show the amount downloaded so far.
                    sys.stdout.write(' [下载进度]:%0.2f MB' % (size / chunk_size / 1024) + '\r')
                sys.stdout.flush()
获得无水印的视频播放地址
def remove_watermark(self, video_url):
    """
    Resolve a share-page URL to a watermark-free download address.

    The URL is typed into the input field of http://douyin.iiilab.com/,
    the form button is clicked, and the ``href`` of the first link in the
    first result paragraph is returned.

    Parameters:
        video_url: share-page URL of the (watermarked) video
    Returns:
        the ``href`` of the first ``<a>`` found in the result markup
    """
    browser = self.driver
    browser.visit('http://douyin.iiilab.com/')
    browser.find_by_tag('input').fill(video_url)

    submit_button = browser.find_by_xpath('//button[@class="btn btn-default"]')
    submit_button.click()

    result_paragraphs = browser.find_by_xpath('//div[@class="thumbnail"]/div/p')
    result_markup = result_paragraphs[0].html

    anchor = BeautifulSoup(result_markup, 'lxml').find('a')
    return anchor.get('href')
下载视频
def run(self):
    """
    Interactive entry point: ask for a user ID and download every video.

    Prints the banner, reads the ID from stdin, resolves the user's video
    list, creates a folder named after the user's nickname and downloads
    each video into it.

    Parameters:
        None
    Returns:
        None
    """
    self.hello()
    user_id = input('请输入ID(例如40103580):')
    video_names, video_urls, nickname = self.get_video_urls(user_id)

    # Path separators and the other characters Windows rejects in file
    # names are removed in one pass, so a name containing both "\" and "/"
    # is cleaned completely.
    illegal_chars = str.maketrans('', '', '\\/:*?"<>|')

    folder = nickname.translate(illegal_chars)
    # exist_ok avoids the separate "is it already there?" directory scan.
    os.makedirs(folder, exist_ok=True)

    print('视频下载中:共有%d个作品!\n' % len(video_urls))
    for num, (raw_name, url) in enumerate(zip(video_names, video_urls), start=1):
        print(' 解析第%d个视频链接 [%s] 中,请稍后!\n' % (num, url))
        video_name = raw_name.translate(illegal_chars)
        self.video_downloader(url, os.path.join(folder, video_name))
        print('\n')

    print('下载完成!')
全部代码
# -*- coding:utf-8 -*-

# Python学习交流群:125240963
# Python学习交流群:125240963
# Python学习交流群:125240963

from contextlib import closing
import json
import os
import re
import sys
import time

from bs4 import BeautifulSoup
import requests
from splinter.browser import Browser
from splinter.driver.webdriver.chrome import Options, Chrome

# Path separators plus the other characters Windows rejects in file names.
_ILLEGAL_FILENAME_CHARS = str.maketrans('', '', '\\/:*?"<>|')


class DouYin(object):
    """Command-line downloader for all videos of one Douyin user."""

    def __init__(self, width = 500, height = 300, executable_path = 'D:/chromedriver'):
        """
        Start the headless Chrome browser used to resolve watermark-free URLs.

        Parameters:
            width: unused, kept for backward compatibility
            height: unused, kept for backward compatibility
            executable_path: location of the chromedriver binary
        """
        # Headless browser
        chrome_options = Options()
        chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"')
        self.driver = Browser(driver_name='chrome', executable_path=executable_path, options=chrome_options, headless=True)

    def get_video_urls(self, user_id):
        """
        Look up a Douyin user and collect the share links of all their videos.

        Parameters:
            user_id: the Douyin ID to search for; it is compared against the
                ``unique_id`` field of each search result
        Returns:
            video_names: list of file names, ``<share description>.mp4`` or a
                running number when the video only has the default description
            video_urls: list of share-page URLs, parallel to ``video_names``
            nickname: nickname of the matched user
        Raises:
            ValueError: if ``user_id`` is empty
            LookupError: if no search result has a matching ``unique_id``
        """
        user_id = str(user_id)
        if not user_id:
            raise ValueError('user_id must not be empty')

        video_names = []
        video_urls = []

        search_url = 'https://api.amemv.com/aweme/v1/discover/search/?cursor=0&keyword=%s&count=10&type=1&retry_type=no_retry&iid=17900846586&device_id=34692364855&ac=wifi&channel=xiaomi&aid=1128&app_name=aweme&version_code=162&version_name=1.6.2&device_platform=android&ssmix=a&device_type=MI+5&device_brand=Xiaomi&os_api=24&os_version=7.0&uuid=861945034132187&openudid=dc451556fc0eeadb&manifest_version_code=162&resolution=1080*1920&dpi=480&update_version_code=1622' % user_id
        # NOTE(review): verify=False turns off TLS certificate checking; kept
        # as in the original, confirm it is still required.
        req = requests.get(url=search_url, verify=False)
        html = json.loads(req.text)

        # The search is issued once and every returned user is inspected.
        # Re-sending the identical request until the first hit matches (the
        # previous behaviour) never terminates when the wanted user is not
        # ranked first.
        user_info = None
        for candidate in html['user_list']:
            if candidate['user_info']['unique_id'] == user_id:
                user_info = candidate['user_info']
                break
        if user_info is None:
            raise LookupError('no user with unique_id %r in the search results' % user_id)

        aweme_count = user_info['aweme_count']
        uid = user_info['uid']
        nickname = user_info['nickname']

        user_url = 'https://www.douyin.com/aweme/v1/aweme/post/?user_id=%s&max_cursor=0&count=%s' % (uid, aweme_count)
        req = requests.get(url=user_url, verify=False)
        html = json.loads(req.text)

        # Videos that only carry the default description get a running
        # number as their file name instead.
        unnamed_index = 1
        for each in html['aweme_list']:
            share_desc = each['share_info']['share_desc']
            if '抖音-原创音乐短视频社区' == share_desc:
                video_names.append(str(unnamed_index) + '.mp4')
                unnamed_index += 1
            else:
                video_names.append(share_desc + '.mp4')
            video_urls.append(each['share_info']['share_url'])

        return video_names, video_urls, nickname

    def get_download_url(self, video_url):
        """
        Resolve a share-page URL to the watermarked video download address.

        Parameters:
            video_url: share-page URL of the (watermarked) video
        Returns:
            download_url: first entry of ``video.play_addr.url_list`` in the
                JSON object embedded in the page's last ``<script>`` tag
        Raises:
            IndexError: if the page has no ``<script>`` tag or the embedded
                ``var data = [...]`` payload is not found
        """
        req = requests.get(url=video_url, verify=False)
        bf = BeautifulSoup(req.text, 'lxml')

        scripts = bf.find_all('script')
        if not scripts:
            raise IndexError('no <script> tag found in %s' % video_url)

        # Raw string: "\[" and "\]" are regex escapes, not string escapes.
        matches = re.findall(r'var data = \[(.+)\];', str(scripts[-1]))
        if not matches:
            raise IndexError('no "var data = [...]" payload found in %s' % video_url)

        video_html = json.loads(matches[0])
        download_url = video_html['video']['play_addr']['url_list'][0]
        return download_url

    def video_downloader(self, video_url, video_name, watermark_flag=True):
        """
        Download one video to disk while printing progress to stdout.

        Parameters:
            video_url: share-page URL of the (watermarked) video
            video_name: path of the file to write
            watermark_flag: when true, resolve the URL through
                ``remove_watermark``; otherwise through ``get_download_url``
        Returns:
            None
        """
        size = 0
        if watermark_flag:
            video_url = self.remove_watermark(video_url)
        else:
            video_url = self.get_download_url(video_url)

        with closing(requests.get(video_url, stream=True, verify=False)) as response:
            if response.status_code != 200:
                # Report the failure instead of silently producing no file.
                sys.stdout.write(' [下载失败]:HTTP %d\n' % response.status_code)
                return

            chunk_size = 1024
            # The header is absent on chunked responses; 0 means "unknown".
            content_size = int(response.headers.get('content-length', 0))
            if content_size > 0:
                sys.stdout.write(' [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024))

            with open(video_name, "wb") as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    size += len(data)

                    if content_size > 0:
                        sys.stdout.write(' [下载进度]:%.2f%%' % float(size / content_size * 100) + '\r')
                    else:
                        # No total size known: show the amount downloaded.
                        sys.stdout.write(' [下载进度]:%0.2f MB' % (size / chunk_size / 1024) + '\r')
                    sys.stdout.flush()

    def remove_watermark(self, video_url):
        """
        Resolve a share-page URL to a watermark-free download address.

        The URL is typed into the input field of http://douyin.iiilab.com/,
        the form button is clicked, and the ``href`` of the first link in
        the first result paragraph is returned.

        Parameters:
            video_url: share-page URL of the (watermarked) video
        Returns:
            the ``href`` of the first ``<a>`` found in the result markup
        """
        self.driver.visit('http://douyin.iiilab.com/')
        self.driver.find_by_tag('input').fill(video_url)
        self.driver.find_by_xpath('//button[@class="btn btn-default"]').click()
        html = self.driver.find_by_xpath('//div[@class="thumbnail"]/div/p')[0].html
        bf = BeautifulSoup(html, 'lxml')
        return bf.find('a').get('href')

    def run(self):
        """
        Interactive entry point: ask for a user ID and download every video.

        Prints the banner, reads the ID from stdin, resolves the user's
        video list, creates a folder named after the user's nickname and
        downloads each video into it.

        Parameters:
            None
        Returns:
            None
        """
        self.hello()
        user_id = input('请输入ID(例如40103580):')
        video_names, video_urls, nickname = self.get_video_urls(user_id)

        # Sanitise the nickname too: it becomes a directory name.
        folder = nickname.translate(_ILLEGAL_FILENAME_CHARS)
        # exist_ok avoids the separate "is it already there?" directory scan.
        os.makedirs(folder, exist_ok=True)

        print('视频下载中:共有%d个作品!\n' % len(video_urls))
        for num, (raw_name, url) in enumerate(zip(video_names, video_urls), start=1):
            print(' 解析第%d个视频链接 [%s] 中,请稍后!\n' % (num, url))
            # One pass removes every illegal character, so a name holding
            # both "\" and "/" is cleaned completely.
            video_name = raw_name.translate(_ILLEGAL_FILENAME_CHARS)
            self.video_downloader(url, os.path.join(folder, video_name))
            print('\n')

        print('下载完成!')

    def hello(self):
        """
        Print the welcome banner.

        Parameters:
            None
        Returns:
            None
        """
        print('*' * 100)
        print('\t\t\t\t抖音App视频下载小助手')
        print('\t\t作者:Python学习交流群:125240963')
        print('*' * 100)


if __name__ == '__main__':
    douyin = DouYin()
    try:
        douyin.run()
    finally:
        # Shut down the headless Chrome started in __init__ so no browser
        # process is left behind, even when a download fails.
        douyin.driver.quit()