# hugo-from-hedgedoc/generate_v3.py

import os
import re
import requests
from bs4 import BeautifulSoup
import markdown


def parse_for_toc(content):
    """Return *content* with every literal ``[TOC]`` marker removed."""
    # TODO: ADD .TableofContent directly to the page (see hugo theme ?)
    pieces = content.split("[TOC]")
    return "".join(pieces)


def parse_for_notice(content):
    """Rewrite ``:::kind ... :::`` alert blocks as Hugo ``notice`` shortcodes.

    Each block that opens with one of the fences below and runs up to the
    next ``:::`` has its opening fence replaced by the paired shortcode and
    its closing ``:::`` replaced by ``{{% /notice %}}``.  The fences are
    handled one after another, in the order listed.

    Args:
        content: Markdown text to convert.

    Returns:
        The converted text; input without any of these blocks is returned
        unchanged.
    """
    closing_tag = "{{% /notice %}}"
    # (opening fence in the source, shortcode that replaces it)
    conversions = (
        (":::warning", "{{% notice info %}}"),
        (":::info", "{{% notice note %}}"),
        (":::success", "{{% notice tip %}}"),
        (":::danger", "{{% notice warning %}}"),
    )

    for fence, opening_tag in conversions:
        # Non-greedy so every block stops at its own closing fence.
        pattern = fence + ".*?:::"
        for block in re.findall(pattern, content, re.MULTILINE | re.DOTALL):
            converted = block.replace(fence, opening_tag)
            converted = converted.replace(":::", closing_tag)
            content = content.replace(block, converted)

    return content


def _write_section_index(heading_dir, heading_text):
    """Write the ``_index.md`` chapter page for a newly created section."""
    index_path = heading_dir + "/_index.md"
    # The literal (including its trailing whitespace) is kept exactly as it
    # was so that generated files do not change.
    index = """
+++
title = "{heading_text}"
chapter = true
weight = 5
+++

# {heading_text}

Discover what this Hugo theme is all about and the core-concepts behind it.
                        """.format(
        heading_text=heading_text
    )
    with open(index_path, "wb") as f:
        f.write(index.encode("utf-8"))
    print("Created", index_path)


def _download_note(link_url, link_text, target_dir):
    """Download ``link_url + "/download"`` and save it as a page in target_dir.

    The request is made before the output file is opened, so a failed
    download is reported and skipped instead of leaving an empty or
    error-page ``.md`` file behind.
    """
    file_path = os.path.join(target_dir, os.path.basename(link_text))
    doc_link = link_url + "/download"

    try:
        response = requests.get(doc_link, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print("Failed to download", doc_link, "-", exc)
        return

    # Work on text throughout and encode once when writing.
    content = parse_for_toc(parse_for_notice(response.content.decode("utf-8")))
    hugo_header = '---\ntitle: "' + link_text + '"\n---\n\n'

    # NOTE(review): this removes every "---" in the note (table separators
    # and horizontal rules included), not only front-matter fences. Kept
    # as-is to preserve the existing output; confirm whether it is intended.
    body = content.replace("---", "")

    with open(file_path + ".md", "wb") as f:
        f.write(hugo_header.encode("utf-8"))
        f.write(body.encode("utf-8"))
    print("Downloaded", doc_link, "to", file_path)


def parse_markdown_file(markdown_file_path, base_dir="./"):
    """Build a directory tree of pages from an index Markdown file.

    The file is rendered to HTML and its top-level elements are walked in
    document order:

    * ``h1``/``h2``/``h3`` create a section directory named after the
      heading text, nested under the closest preceding heading of a lower
      level (or under ``base_dir`` when there is none). A new directory
      also gets an ``_index.md``.
    * ``hr`` resets the position to ``base_dir``; lists that follow are
      ignored until the next heading.
    * ``ul`` downloads every link it contains into the current section
      directory as ``<link text>.md``.

    Args:
        markdown_file_path: Path of the index Markdown file (read as UTF-8).
        base_dir: Root directory of the generated tree; created if missing.

    Returns:
        None.
    """
    with open(markdown_file_path, "r", encoding="utf-8") as f:
        markdown_text = f.read()

    html = markdown.markdown(markdown_text, extensions=["fenced_code"])
    soup = BeautifulSoup(html, "html.parser")

    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    current_heading_level = 1
    current_heading_dir = base_dir
    last_heading_dir = base_dir
    # Stack of (level, directory) for the headings enclosing the current
    # position. It lets a heading that is shallower than the previous one
    # (e.g. an h2 after an h3) find its real parent instead of reusing the
    # previous heading's directory.
    ancestors = []

    for element in soup.children:
        if element.name in ("h1", "h2", "h3"):
            heading_text = element.text.strip()
            heading_level = int(element.name[1])

            print(
                "heading_level: %s(%s) , heading_text: %s, base_dir: %s, last_heading_dir: %s, current_heading_dir: %s "
                % (
                    heading_level,
                    current_heading_level,
                    heading_text,
                    base_dir,
                    last_heading_dir,
                    current_heading_dir,
                )
            )

            # Drop siblings and deeper headings; whatever remains on top is
            # the parent section of this heading.
            while ancestors and ancestors[-1][0] >= heading_level:
                ancestors.pop()
            parent_dir = ancestors[-1][1] if ancestors else base_dir

            heading_dir = os.path.join(parent_dir, heading_text)
            ancestors.append((heading_level, heading_dir))

            if not os.path.exists(heading_dir):
                os.makedirs(heading_dir)
                # add _index.md for page organization
                _write_section_index(heading_dir, heading_text)

            last_heading_dir = parent_dir
            current_heading_level = heading_level
            current_heading_dir = heading_dir

        elif element.name == "hr":
            # Level 0 marks "outside any section": lists are skipped until
            # the next heading.
            ancestors.clear()
            current_heading_level = 0
            current_heading_dir = base_dir

        elif element.name == "ul" and current_heading_level != 0:
            for link in element.find_all("a"):
                link_text = link.text.strip()
                # .get() so an anchor without an href is skipped rather than
                # raising KeyError.
                link_url = link.get("href")
                if link_url:
                    _download_note(link_url, link_text, current_heading_dir)


# Script driver: process content/my-first-post.md with ./content as the
# output root. parse_markdown_file has no return statement, so the value
# printed below is None.
headings = parse_markdown_file(
    markdown_file_path="content/my-first-post.md",
    base_dir="./content",
)
print(headings)