Source code for gitlab_overviewer.services.readme_extraction

  1"""
  2Utility for extracting interpreted information from README content into ReadmeExtract.
  3
  4Implements :any:`/specs/spec_readme_extraction` §1-8, covering:
  5
  6* Class definition and field extraction (§1-2)
  7* Authors/supervisors processing (§3)
  8* Construction process (§4)
  9* Integration with Readme model (§5)
 10* Error handling (§6)
 11* Examples and non-goals (§7-8)
 12"""
 13
 14from typing import Any, Optional, Set
 15import yaml
 16from ..models.readme_extract import ReadmeExtract
 17from ..config.settings import Settings
 18
 19

[docs]
 20def parse_readme(content: str, mode: str = "full") -> dict[str, Any]:
 21    """Parse README content and extract structured information.
 22
 23    Args:
 24        content: The README content to parse
 25        mode: Parsing mode ("full", "todo", etc.)
 26
 27    Returns:
 28        Dictionary containing parsed information with keys:
 29        - front_matter: Extracted YAML frontmatter
 30        - content: Main content without frontmatter
 31    """
 32    if not content:
 33        return {"front_matter": {}, "content": ""}
 34
 35    # Extract frontmatter
 36    front_matter = _extract_frontmatter(content)
 37
 38    # Extract content (everything after frontmatter)
 39    content_lines = content.split("\n")
 40    content_start = 0
 41
 42    if content.startswith("---"):
 43        # Find end of frontmatter
 44        for i, line in enumerate(content_lines[1:], 1):
 45            if line.strip() == "---":
 46                content_start = i + 1
 47                break
 48
 49    main_content = "\n".join(content_lines[content_start:]).strip()
 50
 51    result = {"front_matter": front_matter, "content": main_content}
 52
 53    # Add mode-specific parsing
 54    if mode == "todo":
 55        settings = Settings.current()
 56        todo_sections = extract_todo_sections(content, settings.todo_keywords)
 57        if todo_sections:
 58            result["todo"] = todo_sections
 59
 60    return result

 61
 62

[docs]
 63def extract_readme_data(
 64    content: str, todo_keywords: Optional[Set[str]] = None
 65) -> ReadmeExtract:
 66    """Extract interpreted information from README content and return a ReadmeExtract object.
 67
 68    Only authors, supervisors and raw front-matter are returned per the strict
 69    specification. Any additional metadata is ignored at this stage.
 70    """
 71    if todo_keywords is None:
 72        # Use default todo keywords from settings
 73        settings = Settings.current()
 74        todo_keywords = set(settings.todo_keywords)
 75
 76    # Parse frontmatter
 77    frontmatter = _extract_frontmatter(content)
 78
 79    # Process authors and supervisors
 80    authors, supervisors = _process_authors(
 81        frontmatter.get("authors") or frontmatter.get("author")
 82    )
 83
 84    return ReadmeExtract(
 85        authors=authors,
 86        supervisors=supervisors,
 87        raw_frontmatter=frontmatter,
 88    )

 89
 90
 91def _extract_frontmatter(content: str) -> dict[str, Any]:
 92    if not content.startswith("---"):
 93        return {}
 94    try:
 95        lines = content.split("\n")
 96        if len(lines) < 2:
 97            return {}
 98        end_index = -1
 99        for i, line in enumerate(lines[1:], 1):
100            if line.strip() == "---":
101                end_index = i
102                break
103        if end_index == -1:
104            return {}
105        frontmatter_lines = lines[1:end_index]
106        frontmatter_text = "\n".join(frontmatter_lines)
107        return yaml.safe_load(frontmatter_text) or {}
108    except Exception:
109        return {}
110
111

[docs]
112def extract_first_paragraph(content: str) -> str | None:
113    lines = content.split("\n")
114    start_index = 0
115    if content.startswith("---"):
116        for i, line in enumerate(lines[1:], 1):
117            if line.strip() == "---":
118                start_index = i + 1
119                break
120
121    # Find the first heading and paragraph
122    first_heading = None
123    current_paragraph = []
124    in_paragraph = False
125    in_code_block = False
126
127    for line in lines[start_index:]:
128        stripped = line.strip()
129
130        # Handle code blocks
131        if stripped.startswith("```"):
132            if in_code_block:
133                # End of code block
134                current_paragraph.append(line)
135                in_code_block = False
136            else:
137                # Start of code block
138                if in_paragraph:
139                    current_paragraph.append(line)
140                in_code_block = True
141            continue
142
143        if in_code_block:
144            # Inside code block, always include
145            current_paragraph.append(line)
146            continue
147
148        # Skip empty lines
149        if not stripped:
150            if in_paragraph:
151                # End of paragraph
152                break
153            continue
154
155        # Handle headings
156        if stripped.startswith("#"):
157            if first_heading is None:
158                # First heading - include it
159                first_heading = line
160                continue
161            elif in_paragraph:
162                # Second heading - end of paragraph
163                break
164            else:
165                # Skip subsequent headings before paragraph starts
166                continue
167
168        # Start or continue paragraph
169        in_paragraph = True
170        current_paragraph.append(line)
171
172    # Combine heading and paragraph
173    result_parts = []
174    if first_heading:
175        result_parts.append(first_heading)
176    if current_paragraph:
177        result_parts.extend(current_paragraph)
178
179    if result_parts:
180        # Preserve original formatting: only insert newline that already exists
181        if first_heading and current_paragraph:
182            # Preserve blank line only if it existed in original text
183            double_newline = f"{first_heading}\n\n" in content
184            sep = "\n\n" if double_newline else "\n"
185            return f"{first_heading}{sep}" + "\n".join(current_paragraph).strip()
186        return "\n".join(result_parts).strip()
187
188    return None

189
190

[docs]
191def extract_todo_sections(content: str, todo_keywords: list[str]) -> str | None:
192    """Extract TODO sections from README content based on keywords.
193
194    Args:
195        content: The README content to parse
196        todo_keywords: List of keywords to search for (e.g., ["todo", "status", "stand"])
197
198    Returns:
199        Extracted TODO sections as markdown text, or None if no sections found
200    """
201    if not content or not todo_keywords:
202        return None
203
204    lines = content.split("\n")
205    todo_sections = []
206    current_section = []
207    in_todo_section = False
208
209    # Skip frontmatter if present
210    start_index = 0
211    if content.startswith("---"):
212        for i, line in enumerate(lines[1:], 1):
213            if line.strip() == "---":
214                start_index = i + 1
215                break
216
217    for line in lines[start_index:]:
218        stripped = line.strip().lower()
219
220        # Check if this line contains any todo keywords
221        contains_keyword = any(keyword.lower() in stripped for keyword in todo_keywords)
222
223        # Check for section headers that might contain todo keywords
224        is_header = stripped.startswith("#") and contains_keyword
225
226        if is_header or contains_keyword:
227            # Start or continue a todo section
228            if not in_todo_section:
229                in_todo_section = True
230                current_section = []
231
232            current_section.append(line)
233        elif in_todo_section:
234            # Check if we should end the section
235            if stripped.startswith("#") and not contains_keyword:
236                # New header without todo keywords - end current section
237                if current_section:
238                    todo_sections.append("\n".join(current_section))
239                current_section = []
240                in_todo_section = False
241            elif not stripped:
242                # Empty line - continue section but mark potential end
243                current_section.append(line)
244            else:
245                # Continue current section
246                current_section.append(line)
247
248    # Add final section if still in progress
249    if in_todo_section and current_section:
250        todo_sections.append("\n".join(current_section))
251
252    if todo_sections:
253        return "\n\n".join(todo_sections)
254
255    return None

256
257
258def _process_authors(authors_data: Any) -> tuple[list[str], list[str]]:
259    if not authors_data:
260        return [], []
261    all_authors = []
262    supervisors = []
263
264    def process_author_item(item: Any) -> tuple[str | None, bool]:
265        if isinstance(item, str):
266            return item, False
267        elif isinstance(item, dict):
268            name = item.get("name")
269            if not name:
270                return None, False
271            roles = item.get("roles", [])
272            if isinstance(roles, str):
273                roles = [roles]
274            elif not isinstance(roles, list):
275                roles = []
276            is_supervisor = "Supervision" in roles
277            return name, is_supervisor
278        else:
279            return None, False
280
281    if isinstance(authors_data, str):
282        name, is_supervisor = process_author_item(authors_data)
283        if name:
284            all_authors.append(name)
285            if is_supervisor:
286                supervisors.append(name)
287    elif isinstance(authors_data, list):
288        for item in authors_data:
289            name, is_supervisor = process_author_item(item)
290            if name:
291                all_authors.append(name)
292                if is_supervisor:
293                    supervisors.append(name)
294    elif isinstance(authors_data, dict):
295        name, is_supervisor = process_author_item(authors_data)
296        if name:
297            all_authors.append(name)
298            if is_supervisor:
299                supervisors.append(name)
300    return all_authors, supervisors