Skip to content
This repository was archived by the owner on Jun 3, 2025. It is now read-only.

Commit 3e9e322

Browse files
horheynmrahul-tuli
andauthored
Cp chunk download (#471)
* add more info on what chunk is downloading, and make chunk folder for each file (#469) * Download into chunk folder (#470) * add more info on what chunk is downloading, and make chunk folder for each file * fix bug * comments * add .cache/sparsezoo/neuralmagic/ --------- Co-authored-by: Rahul Tuli <rahul@neuralmagic.com>
1 parent b0d0e9f commit 3e9e322

File tree

1 file changed

+45
-12
lines changed

1 file changed

+45
-12
lines changed

src/sparsezoo/utils/download.py

Lines changed: 45 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,13 @@
1616
import concurrent.futures
1717
import logging
1818
import math
19+
import multiprocessing
1920
import os
2021
import re
2122
import shutil
2223
import threading
2324
from dataclasses import dataclass, field
25+
from pathlib import Path
2426
from queue import Queue
2527
from typing import Any, Callable, Dict, Optional
2628

@@ -92,6 +94,26 @@ def __init__(
9294
self.chunk_bytes = chunk_bytes
9395
self.job_queues = Queue()
9496
self._lock = threading.Lock()
97+
self.chunk_download_path = self.get_chunk_download_path(download_path)
98+
99+
def get_chunk_download_path(self, path: str) -> str:
    """Get the path where chunks will be downloaded"""

    # derive a folder name from the model stub plus the target file name
    components = path.split(os.path.sep)
    stub = components[-3]
    file_name_as_folder = "_".join(components[-2:]).replace(".", "_")

    # keep chunk files in a dedicated cache tree, away from the model folder
    return os.path.join(
        str(Path.home()),
        ".cache",
        "sparsezoo",
        "neuralmagic",
        "chunks",
        stub,
        file_name_as_folder,
    )
95117

96118
def is_range_header_supported(self) -> bool:
97119
"""Check if chunck download is supported"""
@@ -148,9 +170,11 @@ def queue_chunk_download_jobs(self) -> None:
148170
The jobs need to be executed by a worker or scheduler that processes the
149171
queued JobQueues.
150172
"""
151-
download_jobs: Queue = JobQueue(description="Downloading Chunks")
173+
file_name = self.download_path.split(os.path.sep)[-1]
174+
download_jobs: Queue = JobQueue(
175+
description=f"Downloading Chunks for {file_name}"
176+
)
152177
num_download_jobs = math.ceil(self.file_size / self.chunk_bytes)
153-
154178
for job_id in range(num_download_jobs):
155179
start_byte = 0 if job_id == 0 else job_id * (self.chunk_bytes) + 1
156180
end_byte = (
@@ -161,8 +185,10 @@ def queue_chunk_download_jobs(self) -> None:
161185
bytes_range = f"bytes={start_byte}-{end_byte}"
162186

163187
func_kwargs = {
164-
"download_path": self.get_chunk_file_path(
165-
f"{job_id:05d}_{bytes_range}"
188+
"download_path": (
189+
os.path.join(
190+
self.chunk_download_path, f"{job_id:05d}_{bytes_range}"
191+
)
166192
),
167193
"headers": {
168194
"Range": bytes_range,
@@ -237,7 +263,7 @@ def queue_jobs(self) -> None:
237263
)
238264
self.job_queues.put(job_queue)
239265

240-
def run(self, num_threads: int = 10) -> None:
266+
def run(self, num_threads: int = 1) -> None:
241267
"""
242268
Executes queued download jobs in parallel using multiple threads.
243269
@@ -250,6 +276,9 @@ def run(self, num_threads: int = 10) -> None:
250276
file chunks in parallel. Defaults to 10.
251277
252278
"""
279+
available_threads = multiprocessing.cpu_count() - threading.active_count()
280+
num_threads = max(available_threads // 2, num_threads)
281+
253282
is_prev_job_queue_success = True
254283
while not self.job_queues.empty() and is_prev_job_queue_success:
255284
job_queue = self.job_queues.get()
@@ -295,23 +324,25 @@ def execute_job_from_queue(self, job_queue: Queue, **kwargs) -> None:
295324
with self._lock:
296325
job: Job = job_queue.get()
297326
success = False
327+
err = ""
298328
while not success and job.retries < job.max_retries:
299329
try:
300330
job.func(**job.func_kwargs, **kwargs)
301331
success = True
302332
except Exception as _err:
333+
err = _err
303334
_LOGGER.debug(
304335
f"{job.retries/self.max_retries}: "
305336
"Failed running {self.func} with kwargs {job.func_kwargs}"
306337
)
307-
_LOGGER.debug(_err)
338+
_LOGGER.error(_err)
308339
job.retries += 1
309340
if job.retries < job.max_retries:
310341
job_queue.put(job)
311342

312343
if not success:
313344
_LOGGER.debug(f"Chunk download failed after {self.max_retries} retries.")
314-
raise ValueError
345+
raise ValueError(err)
315346

316347
def download_file(
317348
self,
@@ -339,7 +370,10 @@ def download_file(
339370
340371
"""
341372
write_chunk_size = min(CHUNK_BYTES, self.file_size)
373+
_LOGGER.debug("creating ", download_path)
374+
342375
create_parent_dirs(download_path)
376+
343377
response = requests.get(
344378
self.url, headers=headers, stream=True, allow_redirects=True
345379
)
@@ -358,11 +392,10 @@ def combine_chunks_and_delete(self, download_path: str, progress_bar: tqdm) -> N
358392
:param progress_bar: tqdm object showing the progress of combining chunks
359393
360394
"""
361-
parent_directory = os.path.dirname(download_path)
362-
chunk_directory = os.path.join(parent_directory, "chunks")
395+
_LOGGER.debug("Combing and deleting ", self.chunk_download_path)
363396

364397
pattern = re.compile(r"\d+_bytes=")
365-
files = os.listdir(chunk_directory)
398+
files = os.listdir(self.chunk_download_path)
366399

367400
chunk_files = [chunk_file for chunk_file in files if pattern.match(chunk_file)]
368401

@@ -371,13 +404,13 @@ def combine_chunks_and_delete(self, download_path: str, progress_bar: tqdm) -> N
371404
create_parent_dirs(self.download_path)
372405
with open(self.download_path, "wb") as combined_file:
373406
for file_path in sorted_chunk_files:
374-
chunk_path = os.path.join(chunk_directory, file_path)
407+
chunk_path = os.path.join(self.chunk_download_path, file_path)
375408
with open(chunk_path, "rb") as infile:
376409
data = infile.read()
377410
combined_file.write(data)
378411
progress_bar.update(len(data))
379412

380-
shutil.rmtree(chunk_directory)
413+
shutil.rmtree(self.chunk_download_path)
381414

382415
def get_chunk_file_path(self, file_range: str) -> str:
383416
"""

0 commit comments

Comments
 (0)