"""Build a Hugo content tree from a Markdown index page.

The index page is rendered to HTML and walked element by element:

* every ``h1``/``h2``/``h3`` heading becomes a directory holding a chapter
  ``_index.md``;
* every link inside a bullet list is downloaded from ``<href>/download`` and
  stored as a ``.md`` page in the directory of the heading it appears under;
* a horizontal rule resets the position to the base directory and pauses
  downloading until the next heading.

Downloaded pages get a Hugo front-matter header, their ``:::`` notice blocks
are rewritten as Hugo ``notice`` shortcodes and ``[TOC]`` markers are removed.
"""
import os
import re

import markdown
import requests
from bs4 import BeautifulSoup

# Source notice keyword -> Hugo ``notice`` shortcode style.
# The mapping is intentionally not name-for-name; it is kept exactly as the
# original conversion defined it.
_NOTICE_STYLES = (
    ("warning", "info"),
    ("info", "note"),
    ("success", "tip"),
    ("danger", "warning"),
)

# Seconds to wait on a linked document before giving up on it.
_DOWNLOAD_TIMEOUT = 30

# Chapter page written into every heading directory.
_INDEX_TEMPLATE = """
+++
title = "{heading_text}"
chapter = true
weight = 5
+++

# {heading_text}

Discover what this Hugo theme is all about and the core-concepts behind it.
"""


def parse_for_toc(content):
    """Return *content* with every ``[TOC]`` marker removed."""
    # TODO: ADD .TableofContent directly to the page (see hugo theme ?)
    return content.replace("[TOC]", "")


def parse_for_notice(content):
    """Rewrite ``:::<kind> ... :::`` blocks as Hugo ``notice`` shortcodes.

    Each block opened by ``:::warning``, ``:::info``, ``:::success`` or
    ``:::danger`` and closed by the next ``:::`` is replaced by
    ``{{% notice <style> %}} ... {{% /notice %}}`` using ``_NOTICE_STYLES``.
    Text outside such blocks is returned untouched.
    """
    for source_kind, hugo_style in _NOTICE_STYLES:
        opening = "{{% notice " + hugo_style + " %}}"
        # DOTALL lets a notice span several lines; the non-greedy body stops
        # at the first closing ":::".
        pattern = re.compile(":::" + source_kind + r"(.*?):::", re.DOTALL)
        # A callable replacement keeps backslashes in the body literal.
        content = pattern.sub(
            lambda match, opening=opening: (
                opening + match.group(1) + "{{% /notice %}}"
            ),
            content,
        )
    return content


def _download_linked_document(link, target_dir):
    """Download the document behind *link* into *target_dir* as a ``.md`` page.

    Links without an ``href`` are ignored. A document that cannot be fetched
    (connection error, timeout or HTTP error status) is reported and skipped,
    so no empty or error-page file is left behind.
    """
    link_text = link.text.strip()
    link_url = link.get("href")
    if not link_url:
        return

    # basename() keeps a "/" in the link text from escaping target_dir.
    file_path = os.path.join(target_dir, os.path.basename(link_text))
    doc_link = link_url + "/download"

    # Fetch before opening the output file so a failure creates nothing.
    try:
        response = requests.get(doc_link, timeout=_DOWNLOAD_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("Failed to download", doc_link, "-", exc)
        return

    body = parse_for_toc(parse_for_notice(response.content.decode("utf-8")))
    hugo_header = '---\ntitle: "' + link_text + '"\n---\n\n'

    # Binary mode keeps the "\n" line endings byte-exact on every platform.
    with open(file_path + ".md", "wb") as f:
        f.write(hugo_header.encode("utf-8"))
        # Every "---" in the body is dropped, as the original conversion did;
        # the page's own front matter is written separately above.
        f.write(body.replace("---", "").encode("utf-8"))
    print("Downloaded", doc_link, "to", file_path)


def parse_markdown_file(markdown_file_path, base_dir="./"):
    """Create the directory/page tree described by a Markdown index file.

    Args:
        markdown_file_path: Path of the UTF-8 Markdown index to read.
        base_dir: Root directory of the generated tree; created if missing.

    Returns:
        The heading directories that were created or refreshed, in document
        order.
    """
    with open(markdown_file_path, "r", encoding="utf-8") as f:
        markdown_text = f.read()

    html = markdown.markdown(markdown_text, extensions=["fenced_code"])
    soup = BeautifulSoup(html, "html.parser")

    os.makedirs(base_dir, exist_ok=True)

    current_heading_level = 1
    current_heading_dir = base_dir
    last_heading_dir = base_dir  # only reported in the trace line below
    # Heading level -> directory of the most recent heading at that level.
    # Used to find the parent when the document steps back up the hierarchy.
    level_dirs = {}
    heading_dirs = []

    for element in soup.children:
        if element.name in ("h1", "h2", "h3"):
            heading_text = element.text.strip()
            heading_level = int(element.name[1])

            print(
                "heading_level: %s(%s) , heading_text: %s, base_dir: %s, last_heading_dir: %s, current_heading_dir: %s "
                % (
                    heading_level,
                    current_heading_level,
                    heading_text,
                    base_dir,
                    last_heading_dir,
                    current_heading_dir,
                )
            )

            if heading_level == 1:
                # Top-level headings always hang off the base directory.
                heading_dir = os.path.join(base_dir, heading_text)
                last_heading_dir = base_dir
            elif heading_level == current_heading_level:
                # Sibling of the previous heading: same parent directory.
                heading_dir = os.path.join(
                    os.path.dirname(current_heading_dir), heading_text
                )
                last_heading_dir = heading_dir
            elif heading_level > current_heading_level:
                # Deeper heading: nest under the previous one.
                heading_dir = os.path.join(current_heading_dir, heading_text)
                last_heading_dir = current_heading_dir
            else:
                # Shallower heading (e.g. h2 after h3): attach it to the
                # nearest shallower heading still open, or to the base
                # directory when there is none.
                shallower = [lvl for lvl in level_dirs if lvl < heading_level]
                parent_dir = level_dirs[max(shallower)] if shallower else base_dir
                heading_dir = os.path.join(parent_dir, heading_text)
                last_heading_dir = current_heading_dir

            current_heading_dir = heading_dir
            current_heading_level = heading_level

            # This heading closes every heading at its own level or deeper.
            for lvl in [lvl for lvl in level_dirs if lvl >= heading_level]:
                del level_dirs[lvl]
            level_dirs[heading_level] = heading_dir

            os.makedirs(heading_dir, exist_ok=True)

            # add _index.md for page organization
            index_path = os.path.join(heading_dir, "_index.md")
            index = _INDEX_TEMPLATE.format(heading_text=heading_text)
            with open(index_path, "wb") as f:
                f.write(index.encode("utf-8"))
            print("Created", index_path)
            heading_dirs.append(heading_dir)

        elif element.name == "hr":
            # A rule ends the current section; level 0 pauses downloading
            # until the next heading.
            current_heading_level = 0
            current_heading_dir = base_dir
            level_dirs.clear()

        elif element.name == "ul" and current_heading_level != 0:
            for link in element.find_all("a"):
                _download_linked_document(link, current_heading_dir)

    return heading_dirs


if __name__ == "__main__":
    headings = parse_markdown_file("content/my-first-post.md", base_dir="./content")
    print(headings)