# Source module: validate_actions.globals.web_fetcher
"""WebFetcher module for GitHub API interaction."""
import time
from abc import ABC, abstractmethod
from typing import Dict, Optional
import requests
[docs]
class WebFetcher(ABC):
    """Abstract contract for HTTP fetching.

    Implementations of this interface supply the HTTP client behavior used
    throughout the validate-actions tool.

    Examples:
        Typical call pattern:

        >>> fetcher = SomeWebFetcherImplementation()
        >>> response = fetcher.fetch('https://example.com/api/data')
        >>> if response and response.status_code == 200:
        ...     data = response.json()
    """

    @abstractmethod
    def fetch(self, url: str) -> Optional[requests.Response]:
        """Perform an HTTP GET of *url*.

        Args:
            url: A valid HTTP/HTTPS URL to retrieve.

        Returns:
            The response object on success, or ``None`` when the request
            failed permanently or exhausted all retries.
        """
        ...
[docs]
class CachedWebFetcher(WebFetcher):
    """Implementation of WebFetcher with caching and retry logic.

    This implementation provides robust HTTP fetching with the following features:

    - **Response Caching**: Successful responses are cached in memory to avoid
      redundant network requests during a single validation run.
    - **Retry Logic**: Failed requests are retried with exponential backoff
      to handle transient network issues.
    - **Timeout Handling**: Configurable request timeouts prevent hanging
      on slow or unresponsive servers.
    - **Session Reuse**: Reuses HTTP connections for better performance
      when making multiple requests.

    This class is specifically designed for fetching GitHub Actions metadata
    and other external resources needed for workflow validation.
    """

    def __init__(
        self,
        session: Optional[requests.Session] = None,
        max_retries: int = 3,
        request_timeout: int = 1,
        retry_backoff_factor: float = 0.01,
        github_token: Optional[str] = None,
    ) -> None:
        """Initialize the WebFetcher with configurable retry and timeout settings.

        Args:
            session: Optional requests.Session to use. If None, a new session
                will be created. Useful for customizing headers, authentication,
                or other session-level configuration.
            max_retries: Maximum number of retry attempts for failed requests.
                Default is 3. Set to 0 to disable retries.
            request_timeout: Timeout in seconds for each HTTP request.
                Default is 1 second. Applies to both connection and read timeouts.
            retry_backoff_factor: Base delay for exponential backoff between
                retries. Default is 0.01. Sleep time before retry N is
                ``retry_backoff_factor * 2 ** N`` seconds.
            github_token: Optional GitHub API token. When provided it is sent
                as a ``token`` Authorization header on every request made
                through this fetcher's session.

        Note:
            The cache is initialized as empty and will be populated as requests
            are made. Cache entries persist for the lifetime of the WebFetcher
            instance.
        """
        self.cache: Dict[str, Optional[requests.Response]] = {}
        self.session = session or requests.Session()
        self.max_retries = max_retries
        self.request_timeout = request_timeout
        self.retry_backoff_factor = retry_backoff_factor
        if github_token:
            self.session.headers.update({"Authorization": f"token {github_token}"})

    def fetch(self, url: str) -> Optional[requests.Response]:
        """Fetch a URL with caching and intelligent retry logic.

        This method implements a robust HTTP fetching strategy:

        1. **Cache Check**: First checks if the URL has been fetched before
           and returns the cached response if available.
        2. **HTTP Request**: Makes an HTTP GET request with the configured timeout.
        3. **Intelligent Retry Logic**: Only retries on errors that might be
           transient: network errors (timeouts, connection failures), server
           errors (5xx status codes), and rate limiting (429).
        4. **No Retry on Permanent Errors**: Client errors (4xx except 429)
           indicate permanent issues and are not retried.
        5. **Cache Storage**: Both successful and permanently failed requests
           are cached.

        Args:
            url: The URL to fetch. Must be a valid HTTP or HTTPS URL.

        Returns:
            The HTTP response object if the request succeeded (status 2xx),
            or None if the request failed permanently or after all retries.
        """
        if url in self.cache:
            return self.cache[url]

        for attempt in range(self.max_retries + 1):
            try:
                response = self.session.get(url, timeout=self.request_timeout)
                # Permanent client errors (wrong URL, missing auth, ...) are
                # cached as failures and never retried.
                if self._is_permanent_client_error(response.status_code):
                    self.cache[url] = None
                    return None
                response.raise_for_status()
                self.cache[url] = response
                return response
            except (requests.ConnectionError, requests.Timeout, requests.HTTPError):
                # Transient failures: network errors, 5xx, and 429 (anything
                # that raised past the permanent-error check above). Back off
                # exponentially before the next attempt, as documented.
                if attempt < self.max_retries:
                    time.sleep(self.retry_backoff_factor * (2 ** attempt))
            except requests.RequestException:
                # Other request exceptions (invalid URL, too many redirects,
                # ...) are permanent — stop retrying immediately.
                break

        # Cache the failure to avoid repeated attempts for the same URL.
        self.cache[url] = None
        return None

    def _is_permanent_client_error(self, status_code: int) -> bool:
        """Check if an HTTP status code represents a permanent client error.

        Permanent client errors should not be retried because they indicate
        problems with the request itself (wrong URL, missing auth, etc.) rather
        than transient network or server issues. Note that 429 (rate limited)
        is deliberately absent: it is transient and therefore retryable.

        Args:
            status_code: HTTP status code from the response.

        Returns:
            True if this is a permanent client error that should not be retried.
        """
        permanent_errors = {
            400,  # Bad Request - malformed request
            401,  # Unauthorized - missing/invalid auth
            403,  # Forbidden - insufficient permissions
            404,  # Not Found - resource doesn't exist
            405,  # Method Not Allowed - wrong HTTP method
            409,  # Conflict - state conflict
            410,  # Gone - resource permanently removed
            422,  # Unprocessable Entity - invalid request data
        }
        return status_code in permanent_errors

    def clear_cache(self) -> None:
        """Clear all cached HTTP responses."""
        self.cache.clear()