Source code for gitlab_overviewer.services.readme_extraction

  1"""
  2Utility for extracting interpreted information from README content into ReadmeExtract.
  3
  4Implements :any:`/specs/spec_readme_extraction` §1-8, covering:
  5
  6* Class definition and field extraction (§1-2)
  7* Authors/supervisors processing (§3)
  8* Construction process (§4)
  9* Integration with Readme model (§5)
 10* Error handling (§6)
 11* Examples and non-goals (§7-8)
 12"""
 13
 14from typing import Any, Optional, Set
 15import yaml
 16from ..models.readme_extract import ReadmeExtract
 17from ..config.settings import Settings
 18
 19
def parse_readme(content: str, mode: str = "full") -> dict[str, Any]:
    """Split README text into its YAML front matter and the remaining body.

    Args:
        content: Raw README text. A falsy value short-circuits to an empty
            result.
        mode: Parsing mode. Only ``"todo"`` triggers extra work in this
            function; every other value produces just the base result.

    Returns:
        A dictionary with the keys:

        - ``front_matter``: whatever ``_extract_frontmatter`` returns for
          ``content``
        - ``content``: the text after the closing ``---`` line (or the whole
          text when no closing line is found), stripped of surrounding
          whitespace
        - ``todo``: present only in ``"todo"`` mode, and only when
          ``extract_todo_sections`` found something
    """
    if not content:
        return {"front_matter": {}, "content": ""}

    parsed: dict[str, Any] = {"front_matter": _extract_frontmatter(content)}

    # The body starts right after the first line (past line 0) that consists
    # solely of "---"; without such a line the whole text counts as body.
    body_lines = content.split("\n")
    body_start = 0
    if content.startswith("---"):
        body_start = next(
            (
                idx + 1
                for idx, text in enumerate(body_lines[1:], 1)
                if text.strip() == "---"
            ),
            0,
        )
    parsed["content"] = "\n".join(body_lines[body_start:]).strip()

    if mode != "todo":
        return parsed

    # "todo" mode: the keyword list comes from the active settings, and the
    # extractor is handed the full text (it skips front matter on its own).
    keywords = Settings.current().todo_keywords
    todo_text = extract_todo_sections(content, keywords)
    if todo_text:
        parsed["todo"] = todo_text
    return parsed
61 62
def extract_readme_data(
    content: str, todo_keywords: Optional[Set[str]] = None
) -> ReadmeExtract:
    """Build a ``ReadmeExtract`` from the front matter of a README.

    Only the author list, the supervisor list and the raw front matter are
    populated; any other metadata is ignored at this stage.

    Args:
        content: Raw README text.
        todo_keywords: Optional keyword set. When omitted it is resolved from
            ``Settings.current().todo_keywords``. The resolved value is not
            referenced anywhere else in this function.

    Returns:
        A ``ReadmeExtract`` carrying ``authors``, ``supervisors`` and
        ``raw_frontmatter``.
    """
    if todo_keywords is None:
        # Fall back to the keywords configured in the active settings.
        todo_keywords = set(Settings.current().todo_keywords)

    raw = _extract_frontmatter(content)

    # The plural key wins; the singular "author" key is only consulted when
    # "authors" is missing or falsy.
    author_field = raw.get("authors") or raw.get("author")
    author_names, supervisor_names = _process_authors(author_field)

    return ReadmeExtract(
        authors=author_names,
        supervisors=supervisor_names,
        raw_frontmatter=raw,
    )
89 90 91def _extract_frontmatter(content: str) -> dict[str, Any]: 92 if not content.startswith("---"): 93 return {} 94 try: 95 lines = content.split("\n") 96 if len(lines) < 2: 97 return {} 98 end_index = -1 99 for i, line in enumerate(lines[1:], 1): 100 if line.strip() == "---": 101 end_index = i 102 break 103 if end_index == -1: 104 return {} 105 frontmatter_lines = lines[1:end_index] 106 frontmatter_text = "\n".join(frontmatter_lines) 107 return yaml.safe_load(frontmatter_text) or {} 108 except Exception: 109 return {} 110 111
[docs] 112def extract_first_paragraph(content: str) -> str | None: 113 lines = content.split("\n") 114 start_index = 0 115 if content.startswith("---"): 116 for i, line in enumerate(lines[1:], 1): 117 if line.strip() == "---": 118 start_index = i + 1 119 break 120 121 # Find the first heading and paragraph 122 first_heading = None 123 current_paragraph = [] 124 in_paragraph = False 125 in_code_block = False 126 127 for line in lines[start_index:]: 128 stripped = line.strip() 129 130 # Handle code blocks 131 if stripped.startswith("```"): 132 if in_code_block: 133 # End of code block 134 current_paragraph.append(line) 135 in_code_block = False 136 else: 137 # Start of code block 138 if in_paragraph: 139 current_paragraph.append(line) 140 in_code_block = True 141 continue 142 143 if in_code_block: 144 # Inside code block, always include 145 current_paragraph.append(line) 146 continue 147 148 # Skip empty lines 149 if not stripped: 150 if in_paragraph: 151 # End of paragraph 152 break 153 continue 154 155 # Handle headings 156 if stripped.startswith("#"): 157 if first_heading is None: 158 # First heading - include it 159 first_heading = line 160 continue 161 elif in_paragraph: 162 # Second heading - end of paragraph 163 break 164 else: 165 # Skip subsequent headings before paragraph starts 166 continue 167 168 # Start or continue paragraph 169 in_paragraph = True 170 current_paragraph.append(line) 171 172 # Combine heading and paragraph 173 result_parts = [] 174 if first_heading: 175 result_parts.append(first_heading) 176 if current_paragraph: 177 result_parts.extend(current_paragraph) 178 179 if result_parts: 180 # Preserve original formatting: only insert newline that already exists 181 if first_heading and current_paragraph: 182 # Preserve blank line only if it existed in original text 183 double_newline = f"{first_heading}\n\n" in content 184 sep = "\n\n" if double_newline else "\n" 185 return f"{first_heading}{sep}" + "\n".join(current_paragraph).strip() 186 
return "\n".join(result_parts).strip() 187 188 return None
189 190
[docs] 191def extract_todo_sections(content: str, todo_keywords: list[str]) -> str | None: 192 """Extract TODO sections from README content based on keywords. 193 194 Args: 195 content: The README content to parse 196 todo_keywords: List of keywords to search for (e.g., ["todo", "status", "stand"]) 197 198 Returns: 199 Extracted TODO sections as markdown text, or None if no sections found 200 """ 201 if not content or not todo_keywords: 202 return None 203 204 lines = content.split("\n") 205 todo_sections = [] 206 current_section = [] 207 in_todo_section = False 208 209 # Skip frontmatter if present 210 start_index = 0 211 if content.startswith("---"): 212 for i, line in enumerate(lines[1:], 1): 213 if line.strip() == "---": 214 start_index = i + 1 215 break 216 217 for line in lines[start_index:]: 218 stripped = line.strip().lower() 219 220 # Check if this line contains any todo keywords 221 contains_keyword = any(keyword.lower() in stripped for keyword in todo_keywords) 222 223 # Check for section headers that might contain todo keywords 224 is_header = stripped.startswith("#") and contains_keyword 225 226 if is_header or contains_keyword: 227 # Start or continue a todo section 228 if not in_todo_section: 229 in_todo_section = True 230 current_section = [] 231 232 current_section.append(line) 233 elif in_todo_section: 234 # Check if we should end the section 235 if stripped.startswith("#") and not contains_keyword: 236 # New header without todo keywords - end current section 237 if current_section: 238 todo_sections.append("\n".join(current_section)) 239 current_section = [] 240 in_todo_section = False 241 elif not stripped: 242 # Empty line - continue section but mark potential end 243 current_section.append(line) 244 else: 245 # Continue current section 246 current_section.append(line) 247 248 # Add final section if still in progress 249 if in_todo_section and current_section: 250 todo_sections.append("\n".join(current_section)) 251 252 if todo_sections: 253 
return "\n\n".join(todo_sections) 254 255 return None
256 257 258def _process_authors(authors_data: Any) -> tuple[list[str], list[str]]: 259 if not authors_data: 260 return [], [] 261 all_authors = [] 262 supervisors = [] 263 264 def process_author_item(item: Any) -> tuple[str | None, bool]: 265 if isinstance(item, str): 266 return item, False 267 elif isinstance(item, dict): 268 name = item.get("name") 269 if not name: 270 return None, False 271 roles = item.get("roles", []) 272 if isinstance(roles, str): 273 roles = [roles] 274 elif not isinstance(roles, list): 275 roles = [] 276 is_supervisor = "Supervision" in roles 277 return name, is_supervisor 278 else: 279 return None, False 280 281 if isinstance(authors_data, str): 282 name, is_supervisor = process_author_item(authors_data) 283 if name: 284 all_authors.append(name) 285 if is_supervisor: 286 supervisors.append(name) 287 elif isinstance(authors_data, list): 288 for item in authors_data: 289 name, is_supervisor = process_author_item(item) 290 if name: 291 all_authors.append(name) 292 if is_supervisor: 293 supervisors.append(name) 294 elif isinstance(authors_data, dict): 295 name, is_supervisor = process_author_item(authors_data) 296 if name: 297 all_authors.append(name) 298 if is_supervisor: 299 supervisors.append(name) 300 return all_authors, supervisors