hurl/bin/docs/build_standalone_md.py

#!/usr/bin/env python3
"""
Build a standalone Markdown file of all the documentation. All links and anchors are rewritten so the
links stay functional: when files are concatenated, the script ensures that each anchor is specific
to a given page. "The essential, it works": while this script does its job, it may not be easy to
maintain.

Examples:
    $ python3 bin/docs/build_standalone_md.py > docs/standalone/hurl-5.0.1.md
"""

import os
import re
import sys
import unicodedata
from datetime import datetime
from pathlib import Path

import markdown
import tomllib
from markdown import Header, MarkdownDoc, Paragraph, RefLink, Table, Whitespace


def add_section_header(doc: MarkdownDoc, title: str):
    """Append a level-1 section header to a Markdown document.

    The header gets an anchor id computed by `add_header_id` without any
    prefix, and is followed by a newline whitespace node.
    """
    heading = Header(title=title, level=1)
    # Section headers are not tied to a page, so no prefix is used for the id.
    add_header_id(header=heading, prefix=None)
    doc.add_child(heading)
    doc.add_child(Whitespace(content="\n"))


def add_sections(doc: MarkdownDoc, title: str | None, files: list[str]):
    """Add a new section to a Markdown document by concatenating a list of files.

    When `title` is given, a level-1 section header is added first. Each file is
    then parsed, its reference links are inlined, its headers and links are
    rewritten with a page-specific prefix, an `<hr>` separator is appended, and
    the result is added to `doc`. Progress is reported on stderr.
    """
    if title:
        add_section_header(doc=doc, title=title)

    for file in files:
        sys.stderr.write(f">>> Processing <{file}>...\n")
        path = Path(file)
        # Read as UTF-8 explicitly rather than relying on the platform default encoding.
        text = path.read_text(encoding="utf-8")
        file_md = markdown.parse_markdown(text=text)
        file_md.indent()

        # All ref links (https://daringfireball.net/projects/markdown/syntax) are inlined so we can concatenate
        # multiple documents without any problem
        #
        # Before:
        # ```markdown
        # Some bla bal [a reference][ref]
        # [ref]: https://foo.com
        # ```
        #
        # After:
        # ```markdown
        # Some bla bal [a reference](https://foo.com)
        # ```
        inline_ref_link(md=file_md)

        # Anchors are normalized so we can concatenate multiple documents that have the same anchors
        #
        # Before:
        # ```markdown
        # Some bla bal [a reference](#anchor)
        # ```
        #
        # After:
        # ```markdown
        # Some bla bal [a reference](#name-of-the-document-anchor)
        # ```

        # Without a section title, use the page name alone so the literal text "None"
        # never ends up in the anchors.
        anchors_prefix = f"{title} {path.stem}" if title else path.stem
        anchors_prefix = slugify(anchors_prefix)
        rewrite_links(md=file_md, prefix=anchors_prefix)

        # Visual separator between two concatenated pages.
        hr = Paragraph(content="\n\n<hr>\n\n")
        file_md.add_child(hr)

        doc.extend(file_md)


def add_header_id(header: Header, prefix: str | None):
    """Attach an anchor id to a header and refresh its content.

    The id is the slug of the header title, preceded by `prefix` and a dash
    when a non-empty prefix is given.
    Example: `# Some title` => `# Some title {#a-prefix-some-title}`
    """
    title_slug = slugify(header.title)
    header.id = f"{prefix}-{title_slug}" if prefix else title_slug
    header.update_content()


def slugify(text: str) -> str:
    """Return a lowercase, dash-separated ASCII slug of `text`."""
    # Decompose accented characters, then drop everything that is not ASCII.
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    # Keep word characters, whitespace, slashes and dashes only.
    kept = re.sub(r"[^\w\s/-]", "", ascii_only)
    lowered = kept.strip().lower()
    # Collapse runs of dashes/whitespace into one dash, then drop slashes.
    dashed = re.sub(r"[-\s]+", "-", lowered)
    return dashed.replace("/", "")


def section_from_page(page: str) -> str:
    """Return the section title a documentation page belongs to.

    Example: "manual.md" => "Getting Started". Any page that is not listed
    explicitly is considered part of the "File Format" section.
    """
    getting_started_pages = (
        "installation.md",
        "manual.md",
        # The page concatenated by this script is "samples.md"; "sample.md" is kept
        # so that existing callers relying on the former spelling still work.
        "samples.md",
        "sample.md",
        "running-tests.md",
        "frequently-asked-questions.md",
    )
    if page == "home.md":
        return "Introduction"
    if page == "license.md":
        return "Resources"
    if page in getting_started_pages:
        return "Getting Started"
    return "File Format"


def rewrite_links(md: MarkdownDoc, prefix: str):
    """Rewrite header ids, links and anchors of a page so it can be merged with others.

    When multiple Markdown documents are concatenated, anchors of different pages
    can overlap. This function, working on the direct children of `md`:

    1. adds a `prefix`-based id to every header;
    2. rewrites in-page links `[Foo](#anchor)` to `[Foo](#<prefix>-anchor)` in
       paragraphs and tables;
    3. rewrites cross-page links `[Foo](/docs/some-page.md#anchor)` to in-document
       anchors `[Foo](#<section>-some-page-anchor)` in paragraphs and tables
       (each replacement is logged on stderr);
    4. prefixes self-referencing HTML anchors `<a href="#x" id="x">` in tables,
       then reformats the table.
    """
    local_link = r"\[(?P<title>.+?)]\(#(?P<anchor>.+?)\)"
    page_link = r"\[(?P<title>.+?)]\(/docs/(?P<page>[a-zA-Z0-9-/]+?\.md)#?(?P<anchor>[a-z0-9-]+?)?\)"
    html_anchor = r"<a href=\"#(?P<href>.+?)\" id=\"(?P<_id>.+?)\">"

    def prefix_local_link(match_obj):
        # `[Foo](#anchor)` => `[Foo](#current-page-anchor)`
        title = match_obj.group("title")
        anchor = match_obj.group("anchor")
        return f"[{title}](#{prefix}-{anchor})"

    def page_link_to_anchor(match_obj):
        # `[Foo](/docs/some-page.md#anchor)` => `[Foo](#some-section-some-page-anchor)`
        old = match_obj.group(0)
        title = match_obj.group("title")
        page = match_obj.group("page")
        section = slugify(section_from_page(page))
        page = page[:-3]  # Remove .md extension
        anchor = match_obj.group("anchor")
        if anchor:
            _id = f"#{section}-{page}-{anchor}"
        else:
            _id = f"#{section}-{page}"
        new = f"[{title}]({_id})"
        sys.stderr.write(f"Replace `{old}` to `{new}`\n")
        return new

    def prefix_html_anchor(match_obj):
        # `<a href="#aws-sigv4" id="aws-sigv4">` =>
        # `<a href="#current-page-aws-sigv4" id="current-page-aws-sigv4">`
        href = match_obj.group("href")
        _id = match_obj.group("_id")
        if href != _id:
            # Not a self-referencing anchor: leave the tag exactly as it was
            # (rebuilding it by hand would lose the leading `#` of the href).
            return match_obj.group(0)
        return f'<a href="#{prefix}-{href}" id="{prefix}-{_id}">'

    # Find all headers and add an id specific to the page
    # `# Some title` => `# Some title {#some-title}`
    headers = [c for c in md.children if isinstance(c, Header)]
    for header in headers:
        add_header_id(header, prefix=prefix)

    # Links can appear in paragraphs and in tables; both passes use the same nodes.
    text_nodes = [c for c in md.children if isinstance(c, (Paragraph, Table))]
    for node in text_nodes:
        node.content = re.sub(local_link, prefix_local_link, node.content)
    for node in text_nodes:
        node.content = re.sub(page_link, page_link_to_anchor, node.content)

    # Replace Manual links
    tables = [c for c in md.children if isinstance(c, Table)]
    for table in tables:
        table.content = re.sub(html_anchor, prefix_html_anchor, table.content)
        table.reformat()


def inline_ref_link(md: MarkdownDoc):
    """Inline reference links of a document, then remove their definitions.

    As documents are merged, we do not want reference link definitions left in
    the middle of the final document. Every `[ref]` found in a paragraph whose
    text matches a reference definition becomes `[ref](url)`; a `[ref]` with no
    matching definition is left untouched. Both outcomes are logged on stderr.
    """
    paragraphs = [child for child in md.children if isinstance(child, Paragraph)]
    definitions = [child for child in md.children if isinstance(child, RefLink)]

    def expand(match_obj):
        ref = match_obj.group("ref")
        # The first definition with this reference name wins.
        target = next((d for d in definitions if d.ref == ref), None)
        if not target:
            sys.stderr.write(f"No ref for [{ref}]\n")
            return f"[{ref}]"
        url = target.link.strip()
        inlined = f"[{ref}]({url})"
        sys.stderr.write(f"Inline `[{ref}]` to `{inlined}`\n")
        return inlined

    for paragraph in paragraphs:
        paragraph.content = re.sub(r"\[(?P<ref>.+?)]", expand, paragraph.content)

    # The definitions are no longer needed once inlined.
    md.remove_nodes(definitions)


def main() -> int:
    """Build the standalone Markdown documentation and print it on stdout.

    The version is read from `packages/hurl/Cargo.toml` (without any `-SNAPSHOT`
    suffix), the documentation pages are concatenated section by section, and a
    cover (title, version/date, table of contents) is put at the top of the
    document. Returns `os.EX_OK`.
    """
    # Identify version
    with open("packages/hurl/Cargo.toml", "rb") as f:
        manifest = tomllib.load(f)
    version = manifest["package"]["version"].replace("-SNAPSHOT", "")
    sys.stderr.write(f"version:{version}\n")

    # Sections of the final document, in order, with the pages of each one.
    sections = (
        ("Introduction", ["docs/home.md"]),
        (
            "Getting Started",
            [
                "docs/installation.md",
                "docs/manual.md",
                "docs/samples.md",
                "docs/running-tests.md",
                "docs/frequently-asked-questions.md",
            ],
        ),
        (
            "File Format",
            [
                "docs/hurl-file.md",
                "docs/entry.md",
                "docs/request.md",
                "docs/response.md",
                "docs/capturing-response.md",
                "docs/asserting-response.md",
                "docs/filters.md",
                "docs/templates.md",
                "docs/grammar.md",
            ],
        ),
        ("Resources", ["docs/license.md"]),
    )

    standalone_md = MarkdownDoc()
    for section_title, section_files in sections:
        add_sections(doc=standalone_md, title=section_title, files=section_files)

    # Make the cover. The table of contents is computed before any cover node
    # is inserted in the document.
    toc = Paragraph(content=standalone_md.toc())
    date = datetime.today().strftime("%d-%m-%Y")
    cover = [
        Header(title="Hurl Documentation", level=1),
        Whitespace(content="\n"),
        Header(title=f"Version {version} - {date}", level=2),
        Whitespace(content="\n"),
        toc,
    ]
    for position, node in enumerate(cover):
        standalone_md.children.insert(position, node)

    standalone = rewrite_content(text=standalone_md.to_text(), version=version)

    print(standalone)
    return os.EX_OK


def rewrite_content(text: str, version: str) -> str:
    """Apply a fixed list of hardcoded replacements to the final document.

    Replacements are plain substring substitutions applied in order on `text`.
    `version` is only used to build the "download" sentence that is removed from
    the document. Returns the rewritten text.
    """
    replacements = (
        ("/docs/assets/img/", "https://hurl.dev/assets/img/"),
        ('<div id="home-demo"></div>', ""),
        ("[Blog](blog.md)", "[Blog](https://hurl.dev/blog)"),
        (
            "[Tutorial](#file-format-tutorial/your-first-hurl-file)",
            "[Tutorial](https://hurl.dev/docs/tutorial/your-first-hurl-file.html)",
        ),
        (
            "[Documentation](#getting-started-installation)",
            "[Documentation](https://hurl.dev)",
        ),
        (
            f" (download [HTML](/docs/standalone/hurl-{version}.html), [PDF](/docs/standalone/hurl-{version}.pdf), [Markdown](/docs/standalone/hurl-{version}.md))",
            "",
        ),
        ("/docs/asserting-response.html#", "#file-format-asserting-response-"),
        (
            '<a href="/docs/capturing-response.html">',
            '<a href="#file-format-capturing-response-capturing-response">',
        ),
        (
            '<a href="#method">Method</a>',
            '<a href="#file-format-request-method">Method</a>',
        ),
        ('<a href="#url">URL</a>', '<a href="#file-format-request-url">URL</a>'),
        (
            '<a href="#headers">HTTP request headers</a>',
            '<a href="#file-format-request-headers">HTTP request headers</a>',
        ),
        # Same target page as the other request schema anchors around it.
        # NOTE(review): assumes request.md has an "Options" header - confirm.
        (
            '<a href="#options">Options</a>',
            '<a href="#file-format-request-options">Options</a>',
        ),
        (
            '<a href="#query-parameters">query strings</a>',
            '<a href="#file-format-request-query-parameters">query strings</a>',
        ),
        (
            '<a href="#form-parameters">form params</a>',
            '<a href="#file-format-request-form-parameters">form params</a>',
        ),
        (
            '<a href="#cookies">cookies</a>',
            '<a href="#file-format-request-cookies">cookies</a>',
        ),
        (
            '<a href="#basic-authentication">authentication</a>',
            '<a href="#file-format-request-basic-authentication">authentication</a>',
        ),
        (
            '<a href="#body">HTTP request body</a>',
            '<a href="#file-format-request-body">HTTP request body</a>',
        ),
        (
            "[UUID v4 random string]",
            "[UUID v4 random string](https://en.wikipedia.org/wiki/Universally_unique_identifier)",
        ),
        (
            "[RFC 3339]",
            "[RFC 3339](https://www.rfc-editor.org/rfc/rfc3339)",
        ),
    )
    # Order matters: each replacement is applied on the result of the previous one.
    for old, new in replacements:
        text = text.replace(old, new)
    return text


if __name__ == "__main__":
    # Propagate the status returned by main() as the process exit code.
    sys.exit(main())