# Source code for gitlab_overviewer.services.data_collector

from __future__ import annotations

import json
from contextlib import closing
from typing import List

from ..api.client import GitLabClient
from ..api.errors import GitLabAPIError, NotFoundError
from ..models import Group, Project, Issue, Readme
from ..models.overview_data import OverviewData
from ..models import mapper as _mapper
from ..utils.logging import get_logger
from ..config.settings import Settings
from ..services.readme_extraction import extract_readme_data
from ..models.readme_extract import ReadmeExtract

"""
DataCollector – aggregates GitLab groups, projects, readmes and issues.

Implements :any:`/specs/spec_data_collector.md` §1-4, covering:

* Data aggregation workflow (§1)
* Failure handling (§2)
* Concurrency and rate limits (§3)
* Output contract (§4)

The collector is intentionally synchronous (matching
:class:`~gitlab_overviewer.api.client.GitLabClient`) to keep control flow
simple for now.  It is also a *thin* orchestration layer – the heavy lifting
(HTTP calls + JSON → domain models) happens inside
:class:`~gitlab_overviewer.api.client.GitLabClient` and
:mod:`gitlab_overviewer.models.mapper`.  This separation allows the client to
be stubbed cleanly in unit tests.
"""

 32__all__ = ["DataCollector", "DataCollectorError"]
 33
 34logger = get_logger(__name__)
 35
 36
class DataCollectorError(RuntimeError):
    """Raised when data-collection fails – wraps low-level API errors."""


class DataCollector:
    """Aggregate GitLab entities into :class:`OverviewData` instances.

    Parameters
    ----------
    client:
        Optional pre-configured :class:`~gitlab_overviewer.api.client.GitLabClient`.
        If *None* a new client is created with default arguments.
    include_subgroups:
        Whether projects in sub-groups should be traversed.  Currently **not**
        implemented (kept for future compatibility) and will raise
        :class:`NotImplementedError` if *True*.
    """

    def __init__(
        self, client: GitLabClient | None = None, *, include_subgroups: bool = False
    ):
        if include_subgroups:
            raise NotImplementedError("Sub-group traversal is not yet supported.")
        # Test against ``None`` (not truthiness) so the stored client always
        # agrees with the ``_client_injected`` flag.
        self._client = client if client is not None else GitLabClient()
        self._client_injected = client is not None

    # ---------------------------------------------------------------------
    # Public API
    # ---------------------------------------------------------------------

    def collect(self) -> List[OverviewData]:
        """Return a list of :class:`OverviewData` for all accessible projects, using all tokens.

        The tokens are read from the ``group_api_key`` attribute of the
        client's ``settings`` (or of ``Settings.current()`` when the client has
        no ``settings`` attribute).  When no token is configured a single pass
        is made without touching the ``Authorization`` header.

        * If a client was injected, that same client is re-used for every
          token and its ``Authorization`` header is overwritten per token.
        * Otherwise a fresh client of the same type is created per token and
          closed again as soon as that token has been processed – even when
          collection fails part-way.

        Groups are processed at most once across all tokens.  Projects are
        de-duplicated across groups unless the ``display_shared`` setting is
        truthy.

        Returns
        -------
        list[OverviewData]
            One entry per collected project, in traversal order.

        Raises
        ------
        DataCollectorError
            If listing groups/projects raises
            :class:`~gitlab_overviewer.api.errors.GitLabAPIError`, or if a
            README fetch fails with an API error other than "not found".

        Notes
        -----
        The collector's own client is closed before this method returns,
        whether or not collection succeeded.
        """
        settings = (
            self._client.settings
            if hasattr(self._client, "settings")
            else Settings.current()
        )
        tokens = getattr(settings, "group_api_key", [])
        if not tokens:
            # No configured token: do one pass with the client's existing auth.
            tokens = [None]

        overview_rows: list[OverviewData] = []
        seen_group_ids: set = set()
        seen_project_ids: set = set()
        try:
            display_shared = getattr(settings, "display_shared", False)
            for token in tokens:
                if self._client_injected:
                    # An injected client (e.g. a test stub) is used for all tokens.
                    overview_rows.extend(
                        self._collect_for_token(
                            self._client,
                            token,
                            display_shared=display_shared,
                            seen_group_ids=seen_group_ids,
                            seen_project_ids=seen_project_ids,
                        )
                    )
                else:
                    # ``closing`` releases the per-token client even if a
                    # request raises in the middle of the traversal.
                    with closing(type(self._client)()) as client:
                        overview_rows.extend(
                            self._collect_for_token(
                                client,
                                token,
                                display_shared=display_shared,
                                seen_group_ids=seen_group_ids,
                                seen_project_ids=seen_project_ids,
                            )
                        )
        except GitLabAPIError as exc:
            raise DataCollectorError(str(exc)) from exc
        finally:
            self._client.close()

        return overview_rows

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _collect_for_token(
        self,
        client: GitLabClient,
        token: str | None,
        *,
        display_shared: bool,
        seen_group_ids: set,
        seen_project_ids: set,
    ) -> list[OverviewData]:
        """Collect rows for every not-yet-seen group visible to *client*.

        Parameters
        ----------
        client:
            Client used for all requests made on behalf of *token*.
        token:
            API token to install as ``Authorization: Bearer <token>`` on the
            client's underlying HTTP client; skipped when falsy.
        display_shared:
            Forwarded to ``list_projects(..., with_shared=...)``.  When truthy,
            a project that was already collected is collected again.
        seen_group_ids, seen_project_ids:
            Id sets shared across tokens; updated in place.

        Returns
        -------
        list[OverviewData]
            Rows collected during this pass.
        """
        if token:
            client._client.headers["Authorization"] = f"Bearer {token}"

        rows: list[OverviewData] = []
        for group in client.list_groups():
            if group.id in seen_group_ids:
                continue
            seen_group_ids.add(group.id)
            logger.info(
                "Processing group: %s (%s)",
                group.name,
                getattr(group, "path", "N/A"),
            )
            group_projects = client.list_projects(
                int(group.id), with_shared=display_shared
            )
            logger.info(
                " Found %s projects in group %s", len(group_projects), group.name
            )
            for project in group_projects:
                if project.id in seen_project_ids and not display_shared:
                    continue
                seen_project_ids.add(project.id)
                # Pass the listing client along so README/issue requests use
                # the same authorization that made the project visible.
                rows.append(self._collect_project(group, project, client))
        return rows

    def _collect_project(
        self,
        group: Group,
        project: Project,
        client: GitLabClient | None = None,
    ) -> OverviewData:
        """Build the :class:`OverviewData` row for a single project.

        Parameters
        ----------
        group:
            Group the project was listed under.
        project:
            Project to collect the README and issues for.
        client:
            Client to issue the requests with.  Defaults to the collector's own
            client when *None*.

        Returns
        -------
        OverviewData
            Row with ``readme`` set to *None* when ``README.md`` is missing,
            ``issues`` set to *None* when the issue listing failed, and
            ``extra`` always holding a ``ReadmeExtract``.

        Raises
        ------
        DataCollectorError
            If fetching the README fails with an API error other than
            :class:`~gitlab_overviewer.api.errors.NotFoundError`.
        """
        api = self._client if client is None else client

        # README ----------------------------------------------------------
        readme: Readme | None = None
        readme_extract = None
        try:
            default_branch = project.default_branch or "main"
            content = api.fetch_file(project.id, "README.md", ref=default_branch)
            readme_extract = extract_readme_data(content)
            readme = _mapper.readme_from_str(project.id, content)
        except NotFoundError as exc:
            # README truly missing – treat gracefully and continue
            logger.warning(
                "No README found for project '%s' (%s) in group '%s': %s",
                project.name,
                project.id,
                group.name,
                exc,
            )
            readme = None
            readme_extract = ReadmeExtract()
        except GitLabAPIError as exc:
            # Any other API error should propagate as DataCollectorError
            raise DataCollectorError(
                f"Failed to fetch README for project '{project.name}' ({project.id}): {exc}"
            ) from exc

        # Issues ----------------------------------------------------------
        issues: list[Issue] | None
        try:
            issues = api.list_issues(project.id)
        except GitLabAPIError as exc:
            # A failed issue listing is recorded as ``None`` instead of
            # aborting the whole collection run.
            logger.info("Issue list failed for project %s: %s", project.id, exc)
            issues = None

        # Always populate extra with readme_extract as ReadmeExtract instance
        extra = readme_extract if readme_extract else ReadmeExtract()

        logger.debug("Issues for project %s: %s", project.id, issues)
        return OverviewData(
            group=group, project=project, readme=readme, issues=issues, extra=extra
        )