1from __future__ import annotations
2import json
3from typing import List
4from ..api.client import GitLabClient
5from ..api.errors import GitLabAPIError, NotFoundError
6from ..models import Group, Project, Issue, Readme
7from ..models.overview_data import OverviewData
8from ..models import mapper as _mapper
9from ..utils.logging import get_logger
10from ..config.settings import Settings
11from ..services.readme_extraction import extract_readme_data
12from ..models.readme_extract import ReadmeExtract
13
14"""
15DataCollector – aggregates GitLab groups, projects, readmes and issues.
16
17Implements :any:`/specs/spec_data_collector.md` §1-4, covering:
18
19* Data aggregation workflow (§1)
20* Failure handling (§2)
21* Concurrency and rate limits (§3)
22* Output contract (§4)
23
24The collector is intentionally synchronous (matching :class:`~gitlab_overviewer.api.client.GitLabClient`) to
25keep control-flow simple for now. It is also a *thin* orchestration layer – the
26heavy lifting (HTTP calls + JSON → domain models) happens inside
27:class:`~gitlab_overviewer.api.client.GitLabClient` and
28:mod:`gitlab_overviewer.models.mapper`. This separation allows the client to
be stubbed cleanly in unit tests.
30"""
31
32__all__ = ["DataCollector", "DataCollectorError"]
33
34logger = get_logger(__name__)
35
36
[docs]
class DataCollectorError(RuntimeError):
    """Signal that collecting overview data could not be completed.

    Used as the collector-level wrapper around lower-level GitLab API
    failures, so callers only need to handle this one exception type.
    """
39
40
[docs]
class DataCollector:  # noqa: D101 – docstring below
    """Aggregate GitLab entities into :class:`OverviewData` instances.

    Parameters
    ----------
    client:
        Optional pre-configured :class:`~gitlab_overviewer.api.client.GitLabClient`.
        If *None* a new client is created from default settings.
    include_subgroups:
        Whether projects in sub-groups should be traversed. Currently **not**
        implemented (kept for future compatibility) and will raise
        :class:`NotImplementedError` if *True*.
    """

    def __init__(
        self, client: GitLabClient | None = None, *, include_subgroups: bool = False
    ) -> None:
        """Store the client to use and remember whether it was injected.

        Raises
        ------
        NotImplementedError
            If *include_subgroups* is true.
        """
        if include_subgroups:
            raise NotImplementedError("Sub-group traversal is not yet supported.")
        # Use the same ``is not None`` test for both attributes so a falsy but
        # valid injected client is never silently replaced by a default one.
        self._client_injected = client is not None
        self._client = client if client is not None else GitLabClient()

    # ---------------------------------------------------------------------
    # Public API
    # ---------------------------------------------------------------------

    def collect(self) -> List[OverviewData]:  # noqa: D401
        """Return a list of :class:`OverviewData` for all accessible projects, using all tokens.

        Tokens are read from the ``group_api_key`` setting. Groups are
        processed once across all tokens. Projects are de-duplicated across
        groups unless the ``display_shared`` setting is true.

        Returns
        -------
        list[OverviewData]
            One row per collected project, in discovery order.

        Raises
        ------
        DataCollectorError
            If listing groups/projects fails with a :class:`GitLabAPIError`,
            or if a README fetch fails with anything other than "not found".
        """
        settings = (
            self._client.settings
            if hasattr(self._client, "settings")
            else Settings.current()
        )
        tokens = self._normalize_tokens(getattr(settings, "group_api_key", None))
        display_shared = getattr(settings, "display_shared", False)

        overview_rows: list[OverviewData] = []
        seen_project_ids: set = set()
        seen_group_ids: set = set()
        try:
            for token in tokens:
                # An injected client (e.g. in tests) is reused for every token;
                # otherwise each token gets its own fresh client instance.
                if self._client_injected:
                    client = self._client
                else:
                    client = type(self._client)()
                try:
                    if token:
                        # NOTE(review): reaches into the wrapper's private
                        # ``_client`` attribute (presumably the underlying HTTP
                        # session) to override the auth header – confirm.
                        client._client.headers["Authorization"] = f"Bearer {token}"
                    self._collect_with_client(
                        client,
                        display_shared=display_shared,
                        seen_group_ids=seen_group_ids,
                        seen_project_ids=seen_project_ids,
                        rows=overview_rows,
                    )
                finally:
                    # Per-token clients are released even when collection
                    # fails; the shared client is closed by the outer finally.
                    if client is not self._client:
                        client.close()
        except GitLabAPIError as exc:
            raise DataCollectorError(str(exc)) from exc
        finally:
            self._client.close()

        return overview_rows

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _normalize_tokens(raw) -> list:
        """Turn the raw ``group_api_key`` setting into a non-empty token list.

        * Missing/empty values become ``[None]`` (one pass with default auth).
        * A single string becomes a one-element list, so it is not iterated
          character by character.
        * Any other iterable is materialised as a list.
        """
        if not raw:
            return [None]
        if isinstance(raw, str):
            return [raw]
        return list(raw)

    def _collect_with_client(
        self,
        client: GitLabClient,
        *,
        display_shared: bool,
        seen_group_ids: set,
        seen_project_ids: set,
        rows: list[OverviewData],
    ) -> None:
        """Append one row per project reachable through *client* to *rows*.

        *seen_group_ids* and *seen_project_ids* are shared across tokens and
        updated in place. A group is processed only once. A project is skipped
        when already seen, unless *display_shared* is true.
        """
        for group in client.list_groups():
            if group.id in seen_group_ids:
                continue
            seen_group_ids.add(group.id)
            logger.info(
                "Processing group: %s (%s)",
                group.name,
                getattr(group, "path", "N/A"),
            )
            group_projects = client.list_projects(
                int(group.id), with_shared=display_shared
            )
            logger.info(
                " Found %s projects in group %s", len(group_projects), group.name
            )
            for project in group_projects:
                if project.id in seen_project_ids and not display_shared:
                    continue
                seen_project_ids.add(project.id)
                rows.append(self._collect_project(group, project, client))

    def _collect_project(
        self, group: Group, project: Project, client: GitLabClient | None = None
    ) -> OverviewData:  # noqa: D401
        """Build the :class:`OverviewData` row for a single project.

        Parameters
        ----------
        group, project:
            The project and the group it was listed under.
        client:
            Client used for the README and issue requests. Defaults to the
            collector's own client. ``collect`` passes the client that listed
            the project so the same credentials are used for follow-up calls.

        Raises
        ------
        DataCollectorError
            If fetching the README fails with a :class:`GitLabAPIError` other
            than :class:`NotFoundError`.
        """
        api = client if client is not None else self._client

        # README ----------------------------------------------------------
        readme: Readme | None = None
        readme_extract = None
        try:
            content = api.fetch_file(
                project.id, "README.md", ref=project.default_branch or "main"
            )
        except NotFoundError as exc:
            # README truly missing – treat gracefully and continue
            logger.warning(
                "No README found for project '%s' (%s) in group '%s': %s",
                project.name,
                project.id,
                group.name,
                exc,
            )
            readme_extract = ReadmeExtract()
        except GitLabAPIError as exc:
            # Any other API error should propagate as DataCollectorError
            raise DataCollectorError(
                f"Failed to fetch README for project '{project.name}' ({project.id}): {exc}"
            ) from exc
        else:
            readme_extract = extract_readme_data(content)
            readme = _mapper.readme_from_str(project.id, content)

        # Issues ----------------------------------------------------------
        # Issue failures are non-fatal: the row is emitted with issues=None.
        issues: list[Issue] | None
        try:
            issues = api.list_issues(project.id)
        except GitLabAPIError as exc:
            logger.info("Issue list failed for project %s: %s", project.id, exc)
            issues = None

        # Always populate extra with readme_extract as ReadmeExtract instance
        extra = readme_extract if readme_extract else ReadmeExtract()

        logger.debug("Issues for project %s: %s", project.id, issues)
        return OverviewData(
            group=group, project=project, readme=readme, issues=issues, extra=extra
        )