hurl/bin/docs/build_standalone_md.py

#!/usr/bin/env python3
"""
Build a standalone Markdown file of all the documentation. All links and anchors are rewritten so that
they remain functional: when several files are concatenated, the script ensures that each anchor is
specific to its own page. "The essential, it works": this script does the job, but it may not be easy
to maintain.

Examples:
    $ python3 bin/docs/build_standalone_md.py > docs/standalone/hurl-5.0.1.md
"""

import os
import re
import sys
import unicodedata
from pathlib import Path

import markdown
from markdown import Header, MarkdownDoc, Paragraph, RefLink, Table, Whitespace


def add_section_header(doc: MarkdownDoc, title: str):
    """Append a level-1 header carrying `title` to `doc`, followed by a newline node.

    The header gets an anchor id derived from its title alone (no page prefix).
    """
    header = Header(title=title, level=1)
    add_header_id(header=header, prefix=None)
    doc.add_child(header)
    doc.add_child(Whitespace(content="\n"))


def add_sections(doc: MarkdownDoc, title: str | None, files: list[str]):
    """Add a new section to a Markdown document by concatenating a list of files.

    Each file is parsed, its reference links are inlined and its anchors are prefixed, then the
    result (followed by an `<hr>` separator) is appended to `doc`.

    Args:
        doc: the document being assembled; it is mutated in place.
        title: the section title. When set, an h1 section header is added first and the title is
            part of every anchor prefix. When empty or None, no header is added and anchors are
            prefixed with the page name only.
        files: paths of the Markdown files to concatenate, in order.
    """
    if title:
        add_section_header(doc=doc, title=title)

    for file in files:
        sys.stderr.write(f">>> Processing <{file}>...\n")
        path = Path(file)
        # Decode explicitly as UTF-8 (the docs are assumed to be UTF-8 encoded) instead of relying on
        # the platform's locale encoding.
        text = path.read_text(encoding="utf-8")
        file_md = markdown.parse_markdown(text=text)
        file_md.indent()

        # All ref links (https://daringfireball.net/projects/markdown/syntax) are inlined so we can
        # concatenate multiple documents without any problem
        #
        # Before:
        # ```markdown
        # Some bla bla [ref]
        # [ref]: https://foo.com
        # ```
        #
        # After:
        # ```markdown
        # Some bla bla [ref](https://foo.com)
        # ```
        inline_ref_link(md=file_md)

        # Anchors are normalized so we can concatenate multiple documents that have the same anchors
        #
        # Before:
        # ```markdown
        # Some bla bla [a reference](#anchor)
        # ```
        #
        # After:
        # ```markdown
        # Some bla bla [a reference](#section-title-name-of-the-document-anchor)
        # ```

        # Without a title, use the page name alone: formatting None into the prefix would yield
        # anchors starting with "none-".
        anchors_prefix = slugify(f"{title} {path.stem}" if title else path.stem)
        rewrite_links(md=file_md, prefix=anchors_prefix)

        hr = Paragraph(content="\n\n<hr>\n\n")
        file_md.add_child(hr)

        doc.extend(file_md)


def add_header_id(header: Header, prefix: str | None):
    """Give a header an anchor id built from its slugified title.

    With a (non-empty) prefix the id is `<prefix>-<slug>`, otherwise it is just `<slug>`.
    Example: `# Some title` => `# Some title {#a-prefix-some-title}`
    """
    title_slug = slugify(header.title)
    header.id = f"{prefix}-{title_slug}" if prefix else title_slug
    header.update_content()


def slugify(text: str) -> str:
    """Turn a text into a lowercase, hyphen-separated ASCII slug."""
    # Decompose accented characters, then drop whatever cannot be expressed in ASCII.
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_text = decomposed.encode("ascii", "ignore").decode("ascii")

    # Keep only word characters, whitespace, slashes and hyphens.
    kept = re.sub(r"[^\w\s/-]", "", ascii_text)
    kept = kept.strip().lower()

    # Collapse runs of hyphens/whitespace into one hyphen; slashes are removed last.
    hyphenated = re.sub(r"[-\s]+", "-", kept)
    return hyphenated.replace("/", "")


# Section title for every page that does not belong to the default "File Format" section.
# The page names must match the files assembled in `main`.
_SECTION_BY_PAGE = {
    "home.md": "Introduction",
    "license.md": "Resources",
    "installation.md": "Getting Started",
    "manual.md": "Getting Started",
    "samples.md": "Getting Started",
    "running-tests.md": "Getting Started",
    "frequently-asked-questions.md": "Getting Started",
}


def section_from_page(page: str) -> str:
    """Return the section title a page belongs to, ex: "manual.md" => "Getting Started".

    Any page that is not explicitly listed is considered part of the "File Format" section.
    """
    return _SECTION_BY_PAGE.get(page, "File Format")


# `[Foo](#anchor)`: a link to an anchor of the current page.
_LOCAL_ANCHOR_LINK_RE = re.compile(r"\[(?P<title>.+?)]\(#(?P<anchor>.+?)\)")
# `[Foo](/docs/some-page.md#anchor)`: a link to another documentation page (anchor is optional).
_PAGE_LINK_RE = re.compile(
    r"\[(?P<title>.+?)]\(/docs/(?P<page>[a-zA-Z0-9-/]+?\.md)#?(?P<anchor>[a-z0-9-]+?)?\)"
)
# `<a href="#aws-sigv4" id="aws-sigv4">`: an HTML anchor as found in the manual tables.
_HTML_ANCHOR_RE = re.compile(r"<a href=\"#(?P<href>.+?)\" id=\"(?P<_id>.+?)\">")


def rewrite_links(md: MarkdownDoc, prefix: str):
    """Rewrite the headers, links and anchors of a page so it can be merged into a single document.

    When multiple Markdown documents are concatenated, anchors of different pages can overlap, so
    every anchor of this page is made specific to it by using `prefix`.

    Args:
        md: the parsed page; its header, paragraph and table nodes are modified in place.
        prefix: the slug prepended to every anchor of this page.
    """
    # Find all headers and add an id specific to the page
    # `# Some title` => `# Some title {#prefix-some-title}`
    for header in [c for c in md.children if isinstance(c, Header)]:
        add_header_id(header, prefix=prefix)

    def prefix_local_anchor(match_obj):
        # `[Foo](#anchor)` => `[Foo](#current-page-anchor)`
        title = match_obj.group("title")
        anchor = match_obj.group("anchor")
        return f"[{title}](#{prefix}-{anchor})"

    def link_to_merged_page(match_obj):
        # `[Foo](/docs/some-page.md#anchor)` => `[Foo](#section-some-page-anchor)`
        old = match_obj.group(0)
        title = match_obj.group("title")
        page = match_obj.group("page")
        section = slugify(section_from_page(page))
        page = page[:-3]  # Remove .md extension
        anchor = match_obj.group("anchor")
        _id = f"#{section}-{page}-{anchor}" if anchor else f"#{section}-{page}"
        new = f"[{title}]({_id})"
        sys.stderr.write(f"Replace `{old}` to `{new}`\n")
        return new

    def prefix_html_anchor(match_obj):
        href = match_obj.group("href")
        _id = match_obj.group("_id")
        if href != _id:
            # Not a self-referencing anchor: keep the tag exactly as it is (including the `#`).
            return match_obj.group(0)
        return f'<a href="#{prefix}-{href}" id="{prefix}-{_id}">'

    # Local anchors are rewritten on every node first, then links to other pages.
    nodes = [c for c in md.children if isinstance(c, (Paragraph, Table))]
    for node in nodes:
        node.content = _LOCAL_ANCHOR_LINK_RE.sub(prefix_local_anchor, node.content)
    for node in nodes:
        node.content = _PAGE_LINK_RE.sub(link_to_merged_page, node.content)

    # Replace Manual links
    # `<a href="#aws-sigv4" id="aws-sigv4">` => `<a href="#prefix-aws-sigv4" id="prefix-aws-sigv4">`
    for table in [c for c in md.children if isinstance(c, Table)]:
        table.content = _HTML_ANCHOR_RE.sub(prefix_html_anchor, table.content)
        table.reformat()


def inline_ref_link(md: MarkdownDoc):
    """Inline the reference links of a document, then remove their definitions.

    As documents are merged, we do not want reference link definitions to end up in the middle of
    the final document. Every `[ref]` found in a paragraph is looked up among the document's
    RefLink nodes: when one matches it becomes `[ref](url)`, otherwise it is left as is. Both
    outcomes are logged on stderr.
    """
    paragraphs = [child for child in md.children if isinstance(child, Paragraph)]
    definitions = [child for child in md.children if isinstance(child, RefLink)]

    def inline(match_obj):
        name = match_obj.group("ref")
        # The first definition carrying this name wins.
        definition = next((d for d in definitions if d.ref == name), None)
        if not definition:
            sys.stderr.write(f"No ref for [{name}]\n")
            return f"[{name}]"
        inlined = f"[{name}]({definition.link.strip()})"
        sys.stderr.write(f"Inline `[{name}]` to `{inlined}`\n")
        return inlined

    for paragraph in paragraphs:
        paragraph.content = re.sub(r"\[(?P<ref>.+?)]", inline, paragraph.content)

    # The definitions are no longer needed once inlined.
    md.remove_nodes(definitions)


def main() -> int:
    """Assemble the standalone documentation and print it on stdout.

    The sections are built in order, a cover (title, version and table of contents) is put in
    front of them, and some hardcoded replacements are applied on the final text.
    """
    standalone_md = MarkdownDoc()

    # Sections of the standalone document, in output order, with the pages they are made of.
    sections = (
        ("Introduction", ["docs/home.md"]),
        (
            "Getting Started",
            [
                "docs/installation.md",
                "docs/manual.md",
                "docs/samples.md",
                "docs/running-tests.md",
                "docs/frequently-asked-questions.md",
            ],
        ),
        (
            "File Format",
            [
                "docs/hurl-file.md",
                "docs/entry.md",
                "docs/request.md",
                "docs/response.md",
                "docs/capturing-response.md",
                "docs/asserting-response.md",
                "docs/filters.md",
                "docs/templates.md",
                "docs/grammar.md",
            ],
        ),
        ("Resources", ["docs/license.md"]),
    )
    for section_title, section_files in sections:
        add_sections(doc=standalone_md, title=section_title, files=section_files)

    # Make the cover: the table of contents goes first, then the title and version nodes are
    # inserted in front of it.
    standalone_md.children.insert(0, Paragraph(content=standalone_md.toc()))

    cover = (
        (Header, {"title": "Hurl Documentation", "level": 1}),
        (Whitespace, {"content": "\n"}),
        (Header, {"title": "Version 5.0.1 - 18/09/2024", "level": 2}),
        (Whitespace, {"content": "\n"}),
    )
    for position, (node_class, kwargs) in enumerate(cover):
        standalone_md.children.insert(position, node_class(**kwargs))

    print(rewrite_content(standalone_md.to_text()))
    return os.EX_OK


def rewrite_content(text: str) -> str:
    """Apply a fixed list of hardcoded replacements, in order, to the final text."""
    replacements = (
        ("/docs/assets/img/", "https://hurl.dev/assets/img/"),
        ('<div id="home-demo"></div>', ""),
        ("[Blog](blog.md)", "[Blog](https://hurl.dev/blog)"),
        (
            "[Tutorial](#file-format-tutorial/your-first-hurl-file)",
            "[Tutorial](https://hurl.dev/docs/tutorial/your-first-hurl-file.html)",
        ),
        (
            "[Documentation](#getting-started-installation)",
            "[Documentation](https://hurl.dev)",
        ),
        (
            " (download [HTML](/docs/standalone/hurl-5.0.1.html), [PDF](/docs/standalone/hurl-5.0.1.pdf), [Markdown](/docs/standalone/hurl-5.0.1.md))",
            "",
        ),
        ("/docs/asserting-response.html#", "#file-format-asserting-response-"),
        (
            '<a href="/docs/capturing-response.html">',
            '<a href="#file-format-capturing-response-capturing-response">',
        ),
        (
            '<a href="#method">Method</a>',
            '<a href="#file-format-request-method">Method</a>',
        ),
        ('<a href="#url">URL</a>', '<a href="#file-format-request-url">URL</a>'),
        (
            '<a href="#headers">HTTP request headers</a>',
            '<a href="#file-format-request-headers">HTTP request headers</a>',
        ),
        (
            '<a href="#query-parameters">Query strings</a>',
            '<a href="#file-format-request-query-parameters">Query strings</a>',
        ),
        (
            '<a href="#form-parameters">form params</a>',
            '<a href="#file-format-request-form-parameters">form params</a>',
        ),
        (
            '<a href="#cookies">cookies</a>',
            '<a href="#file-format-request-cookies">cookies</a>',
        ),
        (
            '<a href="#basic-authentication">authentication</a>',
            '<a href="#file-format-request-basic-authentication">authentication</a>',
        ),
        (
            '<a href="#body">HTTP request body</a>',
            '<a href="#file-format-request-body">HTTP request body</a>',
        ),
    )
    rewritten = text
    for old, new in replacements:
        rewritten = rewritten.replace(old, new)
    return rewritten


if __name__ == "__main__":
    # Use the value returned by main() as the process exit status instead of discarding it.
    sys.exit(main())