Skip to content

使用 thread_count 开启多个线程后,视频下载请求出现重复,应该如何保证同一个下载链接不会被重复请求? #272

@328477124

Description

@328477124

import os
import re
from urllib.parse import urljoin

import feapder
import requests
from tqdm import tqdm

class AirSpiderDemo(feapder.AirSpider):
    """Download each course's PDF attachment and HLS video into ``courses/<name>/``.

    The caller assigns ``videoList`` (a list of dicts with ``attachmentUrl``,
    ``name`` and ``videoUrl`` keys) before starting the spider.

    Both callbacks are written so that running them again for the same
    request produces the same files instead of growing them.
    NOTE(review): feapder appears to re-queue a request whose callback raised;
    confirm against the feapder retry settings. If so, any exception here
    (for example two threads racing to create the same directory) shows up
    as a "duplicate" download.
    """

    # Course records to crawl; assigned on the class by the caller.
    videoList = []
    # URLs that have already been scheduled, so a link is requested only once.
    requested_urls = set()
    # Browser-like headers sent with every segment download.
    headers = {
        'sec-ch-ua': '"Chromium";v="121", "Not A(Brand";v="99"',
        'Referer': 'https://www.zhifei100.com/',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 SamanthaDoubao/1.33.9',
        'sec-ch-ua-platform': '"Windows"'
    }
    # Seconds to wait for one segment request before giving up.
    segment_timeout = 30

    def start_requests(self):
        """Yield one request per distinct attachment URL and per distinct video URL."""
        for video in self.videoList or []:
            attachment_url = video.get('attachmentUrl')
            video_url = video.get('videoUrl')
            # Both callbacks read this dict back through ``request.videoInfo``.
            video_info = {
                "attachmentUrl": attachment_url,
                "videoName": video.get('name'),
                "videoUrl": video_url
            }
            if self._claim_url(attachment_url):
                yield feapder.Request(attachment_url, callback=self.parse_save_pdf, videoInfo=video_info)
            else:
                print(f"附件url请求错误或已请求: {attachment_url}")
            if self._claim_url(video_url):
                yield feapder.Request(video_url, callback=self.parse_save_video, videoInfo=video_info)
            else:
                print(f"视频请求错误或已请求: {video_url}")

    def _claim_url(self, url):
        """Record *url* and return True if it is a non-empty string not seen before."""
        if not url or not isinstance(url, str) or url in self.requested_urls:
            return False
        self.requested_urls.add(url)
        return True

    @staticmethod
    def _ensure_course_dir(course_name):
        """Create ``courses/<course_name>`` if needed and return its path."""
        course_dir = f"courses/{course_name}"
        # exist_ok avoids FileExistsError when the PDF and video callbacks of
        # the same course reach this point at the same time.
        os.makedirs(course_dir, exist_ok=True)
        return course_dir

    @staticmethod
    def _extract_segments(playlist_text):
        """Return, in order, the URI line that follows each ``#EXTINF`` tag."""
        segments = []
        expect_uri = False
        for raw_line in playlist_text.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith("#EXTINF:"):
                expect_uri = True
            elif expect_uri and not line.startswith("#"):
                segments.append(line)
                expect_uri = False
        return segments

    def parse_save_video(self, request, response):
        """Download every segment of the playlist and join them into one mp4.

        ``response.text`` is parsed as an m3u8 media playlist. Segment URIs
        are resolved against the playlist URL stored in ``request.videoInfo``.

        Raises:
            requests.HTTPError: a segment request returned an error status.
        """
        print("parse_save_video")
        course_name = request.videoInfo.get('videoName')
        playlist_url = request.videoInfo.get('videoUrl')
        # urljoin handles relative and absolute segment URIs, whatever the
        # playlist file is called.
        segment_urls = [
            urljoin(playlist_url, uri)
            for uri in self._extract_segments(response.text)
        ]
        if not segment_urls:
            print(f"未解析到ts分片: {playlist_url}")
            return

        course_dir = self._ensure_course_dir(course_name)
        target_path = f"{course_dir}/{course_name}.mp4"
        # Write to a scratch file opened with "wb" so that a repeated run
        # starts from zero instead of appending to earlier output; the final
        # name only appears once every segment has been written.
        part_path = f"{target_path}.part"
        # The bar counts segments, not megabytes.
        with tqdm(total=len(segment_urls), desc=f"{course_name},下载进度", unit="seg") as pbar:
            with open(part_path, "wb") as f:
                for segment_url in segment_urls:
                    segment = requests.get(
                        segment_url,
                        headers=self.headers,
                        timeout=self.segment_timeout,
                    )
                    # Do not write an error page into the video file.
                    segment.raise_for_status()
                    f.write(segment.content)
                    pbar.update(1)
        os.replace(part_path, target_path)

    def parse_save_pdf(self, request, response):
        """Save the PDF attachment into the course directory, showing a progress bar."""
        print("parse_save_pdf")
        course_name = request.videoInfo.get('videoName')
        course_dir = self._ensure_course_dir(course_name)
        # A memoryview lets each chunk be sliced without copying the body.
        payload = memoryview(response.content)
        file_size = len(payload)
        # Bytes written per progress-bar step.
        block_size = 1024
        with tqdm(total=file_size, unit='B', unit_scale=True, desc=f'Saving PDF for {course_name}') as pbar:
            with open(f"{course_dir}/{course_name}.pdf", "wb") as f:
                for offset in range(0, file_size, block_size):
                    chunk = payload[offset:offset + block_size]
                    f.write(chunk)
                    pbar.update(len(chunk))

if __name__ == "__main__":
    # Endpoint that returns the course's video list (elided in the original post).
    url = "https:"
    headers = {
        # ... request headers elided in the original post
    }
    data = {
        "id": "725"
    }
    cookies = {
        # ... session cookies elided in the original post
    }
    # requests has no default timeout; without one a stalled server hangs the script.
    result = requests.post(url, data=data, headers=headers, cookies=cookies, timeout=30).json()
    # Either level may be missing from the payload; fall back to empty values
    # so the spider starts with nothing to do instead of crashing.
    result = result.get("result") or {}
    videoList = result.get('videoList') or []
    # The spider reads the list from this class attribute.
    AirSpiderDemo.videoList = videoList
    AirSpiderDemo(thread_count=8).start()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions