Source code for gitlab_overviewer.services.readme_extraction
1"""
2Utility for extracting interpreted information from README content into ReadmeExtract.
3
4Implements :any:`/specs/spec_readme_extraction` §1-8, covering:
5
6* Class definition and field extraction (§1-2)
7* Authors/supervisors processing (§3)
8* Construction process (§4)
9* Integration with Readme model (§5)
10* Error handling (§6)
11* Examples and non-goals (§7-8)
12"""
13
14from typing import Any, Optional, Set
15import yaml
16from ..models.readme_extract import ReadmeExtract
17from ..config.settings import Settings
18
19
[docs]
def parse_readme(content: str, mode: str = "full") -> dict[str, Any]:
    """Parse README content into front matter and body text.

    Args:
        content: The README content to parse.
        mode: Parsing mode. ``"todo"`` additionally extracts TODO sections
            using the keywords from ``Settings.current()``; every other
            value (including the default ``"full"``) adds nothing extra.

    Returns:
        Dictionary with the keys:

        - ``front_matter``: value returned by ``_extract_frontmatter``
        - ``content``: text after the front matter block, stripped
        - ``todo``: present only in ``"todo"`` mode, and only when
          ``extract_todo_sections`` returned a non-empty result
    """
    if not content:
        return {"front_matter": {}, "content": ""}

    front_matter = _extract_frontmatter(content)

    lines = content.split("\n")
    body_start = 0
    if content.startswith("---"):
        # The body starts right after the first closing "---" line. When the
        # block is never closed, nothing is skipped and the whole text is body.
        body_start = next(
            (
                index + 1
                for index, line in enumerate(lines[1:], 1)
                if line.strip() == "---"
            ),
            0,
        )

    result: dict[str, Any] = {
        "front_matter": front_matter,
        "content": "\n".join(lines[body_start:]).strip(),
    }

    if mode == "todo":
        settings = Settings.current()
        # The full text is passed on; extract_todo_sections skips the front
        # matter itself.
        todo_sections = extract_todo_sections(content, settings.todo_keywords)
        if todo_sections:
            result["todo"] = todo_sections

    return result
61
62
[docs]
def extract_readme_data(
    content: str, todo_keywords: Optional[Set[str]] = None
) -> ReadmeExtract:
    """Build a ``ReadmeExtract`` from the front matter of README content.

    Only authors, supervisors and the raw front matter are returned; any
    additional metadata is ignored at this stage.

    Args:
        content: The README content to inspect.
        todo_keywords: Accepted for backward compatibility. The value does
            not influence the result, so no settings lookup is performed
            when it is omitted.

    Returns:
        A ``ReadmeExtract`` populated with ``authors``, ``supervisors`` and
        ``raw_frontmatter``.
    """
    frontmatter = _extract_frontmatter(content)
    if not isinstance(frontmatter, dict):
        # YAML front matter may legally be a list or scalar; treat anything
        # that is not a mapping as "no front matter" instead of failing on
        # the .get() calls below.
        frontmatter = {}

    # "authors" takes precedence; "author" is the fallback when it is
    # missing or empty.
    authors, supervisors = _process_authors(
        frontmatter.get("authors") or frontmatter.get("author")
    )

    return ReadmeExtract(
        authors=authors,
        supervisors=supervisors,
        raw_frontmatter=frontmatter,
    )
89
90
91def _extract_frontmatter(content: str) -> dict[str, Any]:
92 if not content.startswith("---"):
93 return {}
94 try:
95 lines = content.split("\n")
96 if len(lines) < 2:
97 return {}
98 end_index = -1
99 for i, line in enumerate(lines[1:], 1):
100 if line.strip() == "---":
101 end_index = i
102 break
103 if end_index == -1:
104 return {}
105 frontmatter_lines = lines[1:end_index]
106 frontmatter_text = "\n".join(frontmatter_lines)
107 return yaml.safe_load(frontmatter_text) or {}
108 except Exception:
109 return {}
110
111
[docs]
112def extract_first_paragraph(content: str) -> str | None:
113 lines = content.split("\n")
114 start_index = 0
115 if content.startswith("---"):
116 for i, line in enumerate(lines[1:], 1):
117 if line.strip() == "---":
118 start_index = i + 1
119 break
120
121 # Find the first heading and paragraph
122 first_heading = None
123 current_paragraph = []
124 in_paragraph = False
125 in_code_block = False
126
127 for line in lines[start_index:]:
128 stripped = line.strip()
129
130 # Handle code blocks
131 if stripped.startswith("```"):
132 if in_code_block:
133 # End of code block
134 current_paragraph.append(line)
135 in_code_block = False
136 else:
137 # Start of code block
138 if in_paragraph:
139 current_paragraph.append(line)
140 in_code_block = True
141 continue
142
143 if in_code_block:
144 # Inside code block, always include
145 current_paragraph.append(line)
146 continue
147
148 # Skip empty lines
149 if not stripped:
150 if in_paragraph:
151 # End of paragraph
152 break
153 continue
154
155 # Handle headings
156 if stripped.startswith("#"):
157 if first_heading is None:
158 # First heading - include it
159 first_heading = line
160 continue
161 elif in_paragraph:
162 # Second heading - end of paragraph
163 break
164 else:
165 # Skip subsequent headings before paragraph starts
166 continue
167
168 # Start or continue paragraph
169 in_paragraph = True
170 current_paragraph.append(line)
171
172 # Combine heading and paragraph
173 result_parts = []
174 if first_heading:
175 result_parts.append(first_heading)
176 if current_paragraph:
177 result_parts.extend(current_paragraph)
178
179 if result_parts:
180 # Preserve original formatting: only insert newline that already exists
181 if first_heading and current_paragraph:
182 # Preserve blank line only if it existed in original text
183 double_newline = f"{first_heading}\n\n" in content
184 sep = "\n\n" if double_newline else "\n"
185 return f"{first_heading}{sep}" + "\n".join(current_paragraph).strip()
186 return "\n".join(result_parts).strip()
187
188 return None
189
190
[docs]
191def extract_todo_sections(content: str, todo_keywords: list[str]) -> str | None:
192 """Extract TODO sections from README content based on keywords.
193
194 Args:
195 content: The README content to parse
196 todo_keywords: List of keywords to search for (e.g., ["todo", "status", "stand"])
197
198 Returns:
199 Extracted TODO sections as markdown text, or None if no sections found
200 """
201 if not content or not todo_keywords:
202 return None
203
204 lines = content.split("\n")
205 todo_sections = []
206 current_section = []
207 in_todo_section = False
208
209 # Skip frontmatter if present
210 start_index = 0
211 if content.startswith("---"):
212 for i, line in enumerate(lines[1:], 1):
213 if line.strip() == "---":
214 start_index = i + 1
215 break
216
217 for line in lines[start_index:]:
218 stripped = line.strip().lower()
219
220 # Check if this line contains any todo keywords
221 contains_keyword = any(keyword.lower() in stripped for keyword in todo_keywords)
222
223 # Check for section headers that might contain todo keywords
224 is_header = stripped.startswith("#") and contains_keyword
225
226 if is_header or contains_keyword:
227 # Start or continue a todo section
228 if not in_todo_section:
229 in_todo_section = True
230 current_section = []
231
232 current_section.append(line)
233 elif in_todo_section:
234 # Check if we should end the section
235 if stripped.startswith("#") and not contains_keyword:
236 # New header without todo keywords - end current section
237 if current_section:
238 todo_sections.append("\n".join(current_section))
239 current_section = []
240 in_todo_section = False
241 elif not stripped:
242 # Empty line - continue section but mark potential end
243 current_section.append(line)
244 else:
245 # Continue current section
246 current_section.append(line)
247
248 # Add final section if still in progress
249 if in_todo_section and current_section:
250 todo_sections.append("\n".join(current_section))
251
252 if todo_sections:
253 return "\n\n".join(todo_sections)
254
255 return None
256
257
258def _process_authors(authors_data: Any) -> tuple[list[str], list[str]]:
259 if not authors_data:
260 return [], []
261 all_authors = []
262 supervisors = []
263
264 def process_author_item(item: Any) -> tuple[str | None, bool]:
265 if isinstance(item, str):
266 return item, False
267 elif isinstance(item, dict):
268 name = item.get("name")
269 if not name:
270 return None, False
271 roles = item.get("roles", [])
272 if isinstance(roles, str):
273 roles = [roles]
274 elif not isinstance(roles, list):
275 roles = []
276 is_supervisor = "Supervision" in roles
277 return name, is_supervisor
278 else:
279 return None, False
280
281 if isinstance(authors_data, str):
282 name, is_supervisor = process_author_item(authors_data)
283 if name:
284 all_authors.append(name)
285 if is_supervisor:
286 supervisors.append(name)
287 elif isinstance(authors_data, list):
288 for item in authors_data:
289 name, is_supervisor = process_author_item(item)
290 if name:
291 all_authors.append(name)
292 if is_supervisor:
293 supervisors.append(name)
294 elif isinstance(authors_data, dict):
295 name, is_supervisor = process_author_item(authors_data)
296 if name:
297 all_authors.append(name)
298 if is_supervisor:
299 supervisors.append(name)
300 return all_authors, supervisors