Source code for validate_actions.globals.web_fetcher

"""WebFetcher module for GitHub API interaction."""
import time
from abc import ABC, abstractmethod
from typing import Dict, Optional

import requests


class WebFetcher(ABC):
    """Contract that every HTTP client in the validate-actions tool fulfils.

    The interface consists of a single operation, :meth:`fetch`, which
    concrete subclasses must implement.

    Examples:
        Typical call pattern:

        >>> fetcher = SomeWebFetcherImplementation()
        >>> response = fetcher.fetch('https://example.com/api/data')
        >>> if response and response.status_code == 200:
        ...     data = response.json()
    """

    @abstractmethod
    def fetch(self, url: str) -> Optional[requests.Response]:
        """Retrieve ``url`` and hand back the HTTP response.

        Args:
            url: Address to request; expected to be a valid HTTP/HTTPS URL.

        Returns:
            The HTTP response object on success, or ``None`` when the
            request failed after all retries or hit an unrecoverable error.
        """
        # The abstract default yields None, so a subclass that delegates via
        # ``super().fetch(url)`` sees "no response".
        return None
class CachedWebFetcher(WebFetcher):
    """Implementation of WebFetcher with caching and retry logic.

    Features:

    - **Response Caching**: Results are kept in an in-memory dict keyed by
      URL, so each URL is requested at most once per instance (until
      :meth:`clear_cache` is called).
    - **Retry Logic**: Network errors and retryable HTTP error statuses are
      retried with exponential backoff.
    - **Timeout Handling**: Every request is issued with a configurable
      timeout.
    - **Session Reuse**: All requests go through one ``requests.Session``.

    This class is designed for fetching GitHub Actions metadata and other
    external resources needed for workflow validation.
    """

    # Status codes that signal a problem with the request itself; retrying
    # them cannot help, so ``fetch`` gives up immediately.
    _PERMANENT_CLIENT_ERRORS = frozenset(
        {
            400,  # Bad Request - malformed request
            401,  # Unauthorized - missing/invalid auth
            403,  # Forbidden - insufficient permissions
            404,  # Not Found - resource doesn't exist
            405,  # Method Not Allowed - wrong HTTP method
            409,  # Conflict - state conflict
            410,  # Gone - resource permanently removed
            422,  # Unprocessable Entity - invalid request data
        }
    )

    def __init__(
        self,
        session: Optional[requests.Session] = None,
        max_retries: int = 3,
        request_timeout: int = 1,
        retry_backoff_factor: float = 0.01,
        github_token: Optional[str] = None,
    ) -> None:
        """Initialize the fetcher with configurable retry and timeout settings.

        Args:
            session: Optional ``requests.Session`` to use. If None, a new
                session is created. Useful for customizing headers,
                authentication, or other session-level configuration.
            max_retries: Maximum number of retry attempts after the first
                request. Default is 3. Set to 0 to disable retries.
            request_timeout: Timeout in seconds passed to each HTTP request.
                Default is 1 second.
            retry_backoff_factor: Base delay in seconds for exponential
                backoff. Default is 0.01. The pause after failed attempt
                ``k`` (0-based) is ``retry_backoff_factor * 2 ** k``.
            github_token: Optional token. When given, an
                ``Authorization: token <github_token>`` header is added to
                the session's default headers. This also modifies a
                caller-supplied ``session``.

        Note:
            The cache starts empty and is populated as requests are made.
            Entries persist for the lifetime of the instance unless
            :meth:`clear_cache` is called.
        """
        self.cache: Dict[str, Optional[requests.Response]] = {}
        # Identity check so a caller-supplied session is always honoured,
        # even if the object happens to be falsy.
        self.session = session if session is not None else requests.Session()
        self.max_retries = max_retries
        self.request_timeout = request_timeout
        self.retry_backoff_factor = retry_backoff_factor

        if github_token:
            self.session.headers.update({"Authorization": f"token {github_token}"})

    def fetch(self, url: str) -> Optional[requests.Response]:
        """Fetch a URL with caching and retry logic.

        Strategy:

        1. **Cache Check**: If ``url`` was fetched before, the cached result
           (a response or ``None``) is returned without a network call.
        2. **HTTP Request**: Otherwise an HTTP GET is issued with the
           configured timeout.
        3. **Permanent Errors**: The status codes in
           ``_PERMANENT_CLIENT_ERRORS`` (400, 401, 403, 404, 405, 409, 410,
           422) end the attempt loop immediately.
        4. **Retryable Errors**: Connection errors, timeouts, and any other
           status for which ``raise_for_status()`` raises (5xx, 429 and the
           remaining 4xx codes) are retried up to ``max_retries`` times with
           exponential backoff between attempts.
        5. **Other Request Errors**: Any other ``requests.RequestException``
           stops the loop without further retries.
        6. **Cache Storage**: Successes and all failures, permanent or
           retry-exhausted, are cached.

        Args:
            url: The URL to fetch. Must be a valid HTTP or HTTPS URL.

        Returns:
            The HTTP response object if the request completed without an
            HTTP error status, or ``None`` if it failed permanently or after
            all retries.
        """
        # ``None`` is a legitimate cached value (a remembered failure), so
        # membership must be tested explicitly rather than via ``.get``.
        if url in self.cache:
            return self.cache[url]

        for attempt in range(self.max_retries + 1):
            try:
                response = self.session.get(url, timeout=self.request_timeout)

                # Permanent client errors are never retried.
                if self._is_permanent_client_error(response.status_code):
                    self.cache[url] = None
                    return None

                response.raise_for_status()
                self.cache[url] = response
                return response

            except (requests.ConnectionError, requests.Timeout, requests.HTTPError):
                # Transient network problems and retryable HTTP statuses:
                # back off, unless this was the final attempt.
                if attempt < self.max_retries:
                    time.sleep(self._retry_delay(attempt))
            except requests.RequestException:
                # Other request exceptions are not retried.
                break

        # Cache the failure to avoid repeated attempts.
        self.cache[url] = None
        return None

    def _retry_delay(self, attempt: int) -> float:
        """Return the pause in seconds after failed attempt ``attempt`` (0-based).

        The delay doubles with each attempt, starting at
        ``retry_backoff_factor``.
        """
        return self.retry_backoff_factor * (2 ** attempt)

    def _is_permanent_client_error(self, status_code: int) -> bool:
        """Check if an HTTP status code represents a permanent client error.

        Permanent client errors are not retried because they indicate a
        problem with the request itself (wrong URL, missing auth, etc.)
        rather than a transient network or server issue.

        Args:
            status_code: HTTP status code from the response.

        Returns:
            True if ``status_code`` is in ``_PERMANENT_CLIENT_ERRORS``.
        """
        return status_code in self._PERMANENT_CLIENT_ERRORS

    def clear_cache(self) -> None:
        """Clear all cached results, both responses and remembered failures."""
        self.cache.clear()