cve/docs/generate_cve_list.py

#!/usr/bin/python3
import json
import os
import re
from pathlib import Path
from typing import Dict, List

# Absolute path of this script, resolved once so both constants share it.
_THIS_FILE = Path(__file__).resolve()
# Parent of the directory holding this script; main() scans it for the
# digit-named (year) folders.
ROOT = _THIS_FILE.parent.parent
# JSON index written by main(), placed next to this script.
OUTPUT = _THIS_FILE.with_name("CVE_list.json")


def normalise_block(text: str) -> str:
    """Collapse a Markdown block into newline-separated, non-empty lines.

    Each line is stripped of surrounding whitespace and, when it is a
    Markdown bullet (``"- item"``), of that single bullet marker. Blank
    lines (including bare ``-`` bullets with no text) are dropped.

    Args:
        text: Raw section text; may use ``\\r\\n`` line endings.

    Returns:
        The cleaned lines joined with ``"\\n"``; an empty string when no
        line carries content.
    """
    cleaned: List[str] = []
    for raw in text.replace("\r\n", "\n").split("\n"):
        line = raw.strip()
        # Remove exactly one bullet marker. ``str.lstrip("- ")`` would treat
        # its argument as a character set and also eat meaningful leading
        # dashes such as "--flag" or "-1".
        if line == "-" or line.startswith("- "):
            line = line[1:].strip()
        if line:
            cleaned.append(line)
    return "\n".join(cleaned)


def parse_sections(content: str) -> Dict[str, str]:
    """Split Markdown text into sections keyed by their heading line.

    A line whose stripped form starts with ``"### "`` or ``"#### "`` opens a
    new section. The key is that stripped heading line itself, and the value
    is the text up to the next such heading, joined with newlines and
    stripped. Text before the first heading is discarded; when a heading
    repeats, the later section replaces the earlier one.

    Args:
        content: Full Markdown document text.

    Returns:
        Mapping of heading line to section body.
    """
    result: Dict[str, str] = {}
    active: str | None = None
    pending: List[str] = []

    for raw_line in content.splitlines():
        candidate = raw_line.strip()
        if not candidate.startswith(("### ", "#### ")):
            # Ordinary body line: keep the unstripped text for the section.
            pending.append(raw_line)
            continue
        # A new heading closes whichever section was being collected.
        if active is not None:
            result[active] = "\n".join(pending).strip()
        active, pending = candidate, []

    # Flush the final section, which has no following heading to close it.
    if active is not None:
        result[active] = "\n".join(pending).strip()

    return result


def collect_links(block: str) -> List[str]:
    """Return the unique, non-empty entries of a bullet list, in order.

    Each line is stripped. Empty lines and lines containing ``"No PoCs"`` are
    skipped, and a leading ``"- "`` bullet marker is removed. Duplicates keep
    only their first occurrence.

    Args:
        block: Section text with one entry per line.

    Returns:
        Entries in first-seen order.
    """
    # A dict keeps insertion order and gives constant-time duplicate checks.
    unique: Dict[str, None] = {}
    for line in block.splitlines():
        item = line.strip()
        if item == "" or "No PoCs" in item:
            continue
        if item.startswith("- "):
            item = item[2:].strip()
        if item:
            unique.setdefault(item, None)
    return list(unique)


def main() -> None:
    """Build the JSON CVE index from the per-year Markdown files under ROOT.

    Every digit-named directory in ROOT is visited in descending name order,
    and every ``*.md`` file inside it (sorted by name) becomes one entry:

    * ``cve``  - the file name without its ``.md`` extension,
    * ``desc`` - the normalised ``### Description`` section,
    * ``poc``  - de-duplicated entries from the ``#### Reference`` and
      ``#### Github`` sections, references first.

    The entries are written to OUTPUT as a single JSON array.
    """
    suffix = ".md"
    cve_entries = []

    # Require a directory as well as a numeric name: a digit-named regular
    # file would otherwise make the os.listdir() call below raise.
    years = sorted(
        (
            entry
            for entry in os.listdir(ROOT)
            if entry.isdigit() and (ROOT / entry).is_dir()
        ),
        reverse=True,
    )

    for year in years:
        year_dir = ROOT / year
        for filename in sorted(os.listdir(year_dir)):
            if not filename.endswith(suffix):
                continue
            content = (year_dir / filename).read_text(encoding="utf-8")

            sections = parse_sections(content)
            description = normalise_block(sections.get("### Description", ""))
            references = collect_links(sections.get("#### Reference", ""))
            github_links = collect_links(sections.get("#### Github", ""))

            # dict.fromkeys keeps first-seen order while dropping links that
            # appear in both sections.
            poc_entries: List[str] = list(dict.fromkeys(references + github_links))

            cve_entries.append({
                # Slice off only the trailing extension; str.replace would
                # also remove any ".md" occurring inside the name.
                "cve": filename[: -len(suffix)],
                "desc": description,
                "poc": poc_entries,
            })

    with open(OUTPUT, "w", encoding="utf-8") as outfile:
        json.dump(cve_entries, outfile, ensure_ascii=False)

    print("CVE list saved to CVE_list.json")


if __name__ == "__main__":
    main()