-
-
Notifications
You must be signed in to change notification settings - Fork 0
🧹 refactor: reduce complexity in download_worker of git-fetch.py #236
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,9 @@ | |
| import os | ||
| import queue | ||
| import sys | ||
| import tempfile | ||
| import unittest | ||
| from unittest import mock | ||
| import urllib.parse | ||
| import urllib.request | ||
| import urllib.error | ||
|
|
@@ -92,6 +95,205 @@ | |
| raise urllib.error.URLError(f"{e} (url: {url})") from e | ||
|
|
||
|
|
||
| class _FakeResponse: | ||
| """Test helper: minimal fake HTTPResponse for process_single_download tests.""" | ||
|
|
||
| def __init__( | ||
| self, | ||
| status: int, | ||
| body: bytes = b"", | ||
| headers: dict[str, str] | None = None, | ||
| chunks: list[bytes] | None = None, | ||
| ) -> None: | ||
| self.status = status | ||
| self._headers = {k.lower(): v for k, v in (headers or {}).items()} | ||
| if chunks is not None: | ||
| self._chunks: list[bytes] = list(chunks) | ||
| else: | ||
| self._chunks = [body] if body else [b""] | ||
|
|
||
| def getheader(self, name: str, default: str | None = None) -> str | None: | ||
| return self._headers.get(name.lower(), default) | ||
|
|
||
| def read(self, _: int = -1) -> bytes: | ||
| if not self._chunks: | ||
| return b"" | ||
| return self._chunks.pop(0) | ||
|
|
||
|
|
||
class _FakeConnection:
    """Test helper: minimal fake HTTPSConnection for process_single_download tests."""

    def __init__(self, responses: list[_FakeResponse]) -> None:
        # Becomes True once close() is called; tests assert on this.
        self.closed = False
        # Every request() call is logged here as (method, url, headers).
        self.request_calls: list[tuple[str, str, dict[str, str] | None]] = []
        # Canned responses, handed out in FIFO order by getresponse().
        self._responses: list[_FakeResponse] = list(responses)

    def request(self, method: str, url: str, headers: dict[str, str] | None = None) -> None:
        """Record the call; no network I/O is performed."""
        self.request_calls.append((method, url, headers))

    def getresponse(self) -> _FakeResponse:
        """Pop and return the next queued canned response."""
        return self._responses.pop(0)

    def close(self) -> None:
        """Mark the connection as torn down so tests can verify it."""
        self.closed = True
|
|
||
|
|
||
class ProcessSingleDownloadTests(unittest.TestCase):
    """Unit tests for process_single_download retry and connection handling."""

    def test_200_writes_file(self) -> None:
        payload = b"hello world"
        fake_conn = _FakeConnection([_FakeResponse(200, chunks=[payload, b""])])
        with tempfile.TemporaryDirectory() as workdir:
            target = Path(workdir) / "file.txt"
            returned = process_single_download(
                conn=fake_conn,
                host="example.com",
                url_path="/path",
                local_path=target,
                display_path="file.txt",
                headers={},
            )
            self.assertTrue(target.exists())
            self.assertEqual(target.read_bytes(), payload)
            # Connection should be reused for 200 without Connection: close
            self.assertIs(returned, fake_conn)

    def test_5xx_and_429_retries_then_succeeds(self) -> None:
        # First 500, then 429, then 200 with body.
        payload = b"ok"
        fake_conn = _FakeConnection(
            [
                _FakeResponse(500, chunks=[b"", b""]),
                _FakeResponse(429, chunks=[b"", b""]),
                _FakeResponse(200, chunks=[payload, b""]),
            ]
        )
        with tempfile.TemporaryDirectory() as workdir:
            target = Path(workdir) / "file.txt"
            returned = process_single_download(
                conn=fake_conn,
                host="example.com",
                url_path="/retry",
                local_path=target,
                display_path="file.txt",
                headers={},
            )
            self.assertTrue(target.exists())
            self.assertEqual(target.read_bytes(), payload)
            # Should have issued three requests due to retries on 500 and 429.
            self.assertEqual(len(fake_conn.request_calls), 3)
            self.assertIs(returned, fake_conn)

    def test_connection_close_triggers_new_httpsconnection(self) -> None:
        payload = b"data"
        first_conn = _FakeConnection(
            [
                _FakeResponse(
                    200,
                    chunks=[payload, b""],
                    headers={"Connection": "close"},
                )
            ]
        )
        with tempfile.TemporaryDirectory() as workdir:
            target = Path(workdir) / "file.txt"
            with mock.patch("http.client.HTTPSConnection") as patched_ctor:
                replacement = mock.Mock(spec=http.client.HTTPSConnection)
                patched_ctor.return_value = replacement
                returned = process_single_download(
                    conn=first_conn,
                    host="example.com",
                    url_path="/close",
                    local_path=target,
                    display_path="file.txt",
                    headers={},
                )
            # File should be written correctly.
            self.assertTrue(target.exists())
            self.assertEqual(target.read_bytes(), payload)
            # Initial connection should be closed, and a new HTTPSConnection created.
            self.assertTrue(first_conn.closed)
            patched_ctor.assert_called_once_with("example.com", timeout=30)
            self.assertIs(returned, replacement)
|
|
||
|
|
||
def process_single_download(
    conn: http.client.HTTPSConnection,
    host: str,
    url_path: str,
    local_path: Path,
    display_path: str,
    headers: dict[str, str],
) -> http.client.HTTPSConnection:
    """Download one file over a persistent HTTPS connection, with retries.

    Issues a GET for *url_path* on *conn* and streams a 200 response body to
    *local_path* (parent directories are created as needed).  Transient
    failures — HTTP 5xx, HTTP 429, and connection-level errors — are retried
    up to three attempts.  Redirects (301/302/307/308) and client errors
    (401/403/404) are reported and not retried.  If the server sends
    ``Connection: close``, the connection is replaced with a fresh one, so
    callers must keep using the returned connection object.

    Args:
        conn: Open HTTPS connection to *host*; may be replaced internally.
        host: Host name used when a fresh connection must be opened.
        url_path: Path component of the URL to fetch.
        local_path: Destination file path on disk.
        display_path: Human-readable path used in progress messages.
        headers: Request headers sent with every attempt.

    Returns:
        The connection to use for subsequent requests — either the original
        *conn* or a new ``HTTPSConnection`` if the server closed the old one.
    """

    def _rotate_if_closed(should_close: bool) -> None:
        # Honor "Connection: close": drop the persistent connection and
        # open a fresh one for the next request.  Centralized here because
        # every status branch needs the identical handling.
        nonlocal conn
        if should_close:
            conn.close()
            conn = http.client.HTTPSConnection(host, timeout=30)

    retries = 3
    while retries > 0:
        try:
            conn.request("GET", url_path, headers=headers)
            resp = conn.getresponse()

            # Check if the server wants to close the connection
            should_close = resp.getheader("Connection", "").lower() == "close"

            if resp.status == 200:
                # Create parent directory to avoid FileNotFoundError
                local_path.parent.mkdir(parents=True, exist_ok=True)
                with open(local_path, "wb") as f:
                    # Stream in 64 KiB chunks to keep memory bounded.
                    while chunk := resp.read(65536):
                        f.write(chunk)
                print(f"✓ {display_path}")
                _rotate_if_closed(should_close)
                break
            elif resp.status in (301, 302, 307, 308):
                loc = resp.getheader("Location")
                resp.read()  # Consume body so the connection can be reused
                if loc:
                    print(f"✗ {display_path}: Redirect to {loc} not handled in persistent mode")
                else:
                    print(f"✗ {display_path}: HTTP {resp.status}")
                _rotate_if_closed(should_close)
                break
            else:
                print(f"✗ {display_path}: HTTP {resp.status}")
                resp.read()  # Consume body so the connection can be reused
                _rotate_if_closed(should_close)

                # Non-retriable client errors: fail fast
                if resp.status in (401, 403, 404):
                    break

                # Retry on transient server errors and rate limiting
                if 500 <= resp.status < 600 or resp.status == 429:
                    retries -= 1
                    if retries > 0:
                        continue
                    # Out of retries, give up
                    break

                # Default: treat other statuses as non-retriable
                break
        except (http.client.HTTPException, OSError) as e:
            # Connection might have been closed by server unexpectedly;
            # reconnect only if attempts remain.
            conn.close()
            retries -= 1
            if retries > 0:
                conn = http.client.HTTPSConnection(host, timeout=30)
            else:
                print(f"✗ {display_path}: {e}")

    return conn
|
Comment on lines
+217
to
+294
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. While this refactoring is a good step in reducing complexity in
|
||
|
|
||
|
|
||
| def download_worker(host: str, file_q: queue.Queue, headers: dict[str, str]) -> None: | ||
| """Worker thread: process files from queue using a persistent connection.""" | ||
| # Make a per-thread copy so we don't mutate a shared headers dict. | ||
|
|
@@ -111,78 +313,9 @@ | |
|
|
||
| try: | ||
| url_path, local_path, display_path = item | ||
|
|
||
| # Process download | ||
| retries = 3 | ||
| while retries > 0: | ||
| try: | ||
| conn.request("GET", url_path, headers=headers) | ||
| resp = conn.getresponse() | ||
|
|
||
| # Check if the server wants to close the connection | ||
| connection_header = resp.getheader("Connection", "").lower() | ||
| should_close = connection_header == "close" | ||
|
|
||
| if resp.status == 200: | ||
| # Create parent directory to avoid FileNotFoundError | ||
| local_path.parent.mkdir(parents=True, exist_ok=True) | ||
| with open(local_path, "wb") as f: | ||
| while True: | ||
| chunk = resp.read(65536) | ||
| if not chunk: | ||
| break | ||
| f.write(chunk) | ||
| print(f"✓ {display_path}") | ||
|
|
||
| if should_close: | ||
| conn.close() | ||
| # Reconnect for the next file in the queue | ||
| conn = http.client.HTTPSConnection(host, timeout=30) | ||
|
|
||
| break | ||
| elif resp.status in (301, 302, 307, 308): | ||
| loc = resp.getheader("Location") | ||
| resp.read() # Consume body | ||
| if loc: | ||
| print( | ||
| f"✗ {display_path}: Redirect to {loc} not handled in persistent mode" | ||
| ) | ||
| else: | ||
| print(f"✗ {display_path}: HTTP {resp.status}") | ||
|
|
||
| if should_close: | ||
| conn.close() | ||
| conn = http.client.HTTPSConnection(host, timeout=30) | ||
| break | ||
| else: | ||
| print(f"✗ {display_path}: HTTP {resp.status}") | ||
| resp.read() # Consume body | ||
| if should_close: | ||
| conn.close() | ||
| conn = http.client.HTTPSConnection(host, timeout=30) | ||
|
|
||
| # Non-retriable client errors: fail fast | ||
| if resp.status in (401, 403, 404): | ||
| break | ||
|
|
||
| # Retry on transient server errors and rate limiting | ||
| if 500 <= resp.status < 600 or resp.status == 429: | ||
| retries -= 1 | ||
| if retries > 0: | ||
| continue | ||
| # Out of retries, give up | ||
| break | ||
|
|
||
| # Default: treat other statuses as non-retriable | ||
| break | ||
| except (http.client.HTTPException, OSError) as e: | ||
| # Connection might have been closed by server unexpectedly | ||
| conn.close() | ||
| retries -= 1 | ||
| if retries > 0: | ||
| conn = http.client.HTTPSConnection(host, timeout=30) | ||
| else: | ||
| print(f"✗ {display_path}: {e}") | ||
| conn = process_single_download( | ||
| conn, host, url_path, local_path, display_path, headers | ||
| ) | ||
| finally: | ||
| file_q.task_done() | ||
| finally: | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.