You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
160 lines
5.6 KiB
160 lines
5.6 KiB
import os
|
|
import re
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import markdown
|
|
|
|
|
|
def parse_for_toc(content):
    """Return *content* with every literal ``[TOC]`` marker removed.

    The marker is dropped rather than translated; no table of contents is
    generated in its place.
    """
    # TODO: add a table of contents directly to the page (see the Hugo theme?)
    pieces = content.split("[TOC]")
    return "".join(pieces)
|
|
|
|
|
|
# Closing shortcode shared by every converted notice block.
_NOTICE_CLOSE = "{{% /notice %}}"

# (compiled pattern, opening shortcode) pairs, applied in this order.
# Each pattern matches ":::<marker>", the shortest possible body (newlines
# included, hence DOTALL) and the terminating ":::".
_NOTICE_RULES = tuple(
    (
        re.compile(":::" + re.escape(marker) + r"(.*?):::", re.DOTALL),
        "{{% notice " + style + " %}}",
    )
    for marker, style in (
        ("warning", "info"),
        ("info", "note"),
        ("success", "tip"),
        ("danger", "warning"),
    )
)


def parse_for_notice(content):
    """Convert ``:::<marker> ... :::`` blocks into Hugo notice shortcodes.

    The markers are rewritten as follows, one marker at a time in this order:

    * ``:::warning`` -> ``{{% notice info %}}``
    * ``:::info``    -> ``{{% notice note %}}``
    * ``:::success`` -> ``{{% notice tip %}}``
    * ``:::danger``  -> ``{{% notice warning %}}``

    The terminating ``:::`` of each block becomes ``{{% /notice %}}``; the
    text between the markers is copied through unchanged.

    Args:
        content: Markdown text to convert.

    Returns:
        The converted text. Text without any matching block is returned
        unchanged.
    """
    for pattern, opening in _NOTICE_RULES:
        # A callable replacement keeps the block body literal: backslashes or
        # "\1"-like sequences in the document are never read as regex escapes.
        content = pattern.sub(
            lambda match, opening=opening: opening + match.group(1) + _NOTICE_CLOSE,
            content,
        )
    return content
|
|
|
|
|
|
# Seconds to wait for a linked document before giving up on the request.
_DOWNLOAD_TIMEOUT = 30

# Chapter page written into every heading directory. The leading blank line
# before the "+++" front matter is reproduced from the original template.
_INDEX_TEMPLATE = (
    "\n"
    "+++\n"
    'title = "{heading_text}"\n'
    "chapter = true\n"
    "weight = 5\n"
    "+++\n"
    "\n"
    "# {heading_text}\n"
    "\n"
    "Discover what this Hugo theme is all about and the core-concepts behind it.\n"
)


def _write_chapter_index(heading_dir, heading_text):
    """Create ``_index.md`` for a heading directory unless it already exists."""
    index_path = os.path.join(heading_dir, "_index.md")
    # NOTE(review): the source's indentation was lost, so it is unclear whether
    # the index was rewritten on every run or only for new directories.
    # Writing only when the file is missing covers new directories either way
    # and never clobbers an existing page - confirm this is the intent.
    if os.path.exists(index_path):
        return
    index = _INDEX_TEMPLATE.format(heading_text=heading_text)
    with open(index_path, "wb") as f:
        f.write(index.encode("utf-8"))
    print("Created", index_path)


def _download_page(link_url, link_text, target_dir):
    """Download one linked document and store it as ``<link_text>.md``."""
    file_path = os.path.join(target_dir, os.path.basename(link_text))
    # Presumably the "/download" suffix returns the raw markdown of the
    # linked document - verify against the service that hosts the links.
    doc_link = link_url + "/download"

    # Fetch before opening the output file so that a failed request does not
    # leave an empty page behind.
    response = requests.get(doc_link, timeout=_DOWNLOAD_TIMEOUT)
    if not response.ok:
        # Do not save an HTTP error body as if it were the document.
        print("Skipped", doc_link, "- HTTP status", response.status_code)
        return

    body = parse_for_toc(parse_for_notice(response.content.decode("utf-8")))
    # Every "---" run is stripped from the body so that only the header
    # written below carries front-matter delimiters. This also removes
    # horizontal rules and table separator dashes, as the original did.
    body = body.replace("---", "")
    hugo_header = '---\ntitle: "' + link_text + '"\n---\n\n'

    with open(file_path + ".md", "wb") as f:
        f.write(hugo_header.encode("utf-8"))
        f.write(body.encode("utf-8"))
    print("Downloaded", doc_link, "to", file_path)


def parse_markdown_file(markdown_file_path, base_dir="./"):
    """Build a Hugo content tree from the outline in a markdown file.

    The file is rendered to HTML and its top-level elements are walked in
    document order:

    * ``h1``/``h2``/``h3`` create a directory named after the heading text,
      nested under the closest preceding heading of a shallower level (or
      under *base_dir* when there is none), and get an ``_index.md`` chapter
      page. A heading that is shallower than the previous one closes the
      deeper headings instead of being rejected.
    * ``hr`` resets the hierarchy to *base_dir*; lists that follow it are
      ignored until the next heading.
    * ``ul`` is scanned for links; each link target with ``/download``
      appended is fetched and saved as ``<link text>.md`` in the directory
      of the current heading, after notice and ``[TOC]`` conversion.

    Args:
        markdown_file_path: Path of the markdown outline, read as UTF-8.
        base_dir: Root directory of the generated tree; created if missing.

    Returns:
        None. The results are the directories and files written to disk and
        the progress lines printed to stdout.

    Raises:
        OSError: If the outline cannot be read or an output cannot be written.
        requests.RequestException: If a download fails at the network level
            or exceeds the timeout.
    """
    with open(markdown_file_path, "r", encoding="utf-8") as f:
        markdown_text = f.read()

    html = markdown.markdown(markdown_text, extensions=["fenced_code"])
    soup = BeautifulSoup(html, "html.parser")

    os.makedirs(base_dir, exist_ok=True)

    # Open headings as (level, directory), shallowest first. The level-0
    # entry stands for base_dir and is never popped.
    heading_stack = [(0, base_dir)]
    current_heading_level = 1
    current_heading_dir = base_dir

    for element in soup.children:
        if element.name in ("h1", "h2", "h3"):
            heading_text = element.text.strip()
            heading_level = int(element.name[1])

            print(
                "heading_level: %s(%s) , heading_text: %s, base_dir: %s, current_heading_dir: %s "
                % (
                    heading_level,
                    current_heading_level,
                    heading_text,
                    base_dir,
                    current_heading_dir,
                )
            )

            # Close every heading at the same or a deeper level; what is left
            # on top of the stack is the parent of the new heading.
            while heading_stack[-1][0] >= heading_level:
                heading_stack.pop()
            heading_dir = os.path.join(heading_stack[-1][1], heading_text)
            heading_stack.append((heading_level, heading_dir))

            os.makedirs(heading_dir, exist_ok=True)
            _write_chapter_index(heading_dir, heading_text)

            current_heading_level = heading_level
            current_heading_dir = heading_dir

        elif element.name == "hr":
            # Level 0 doubles as the "ignore lists" marker checked below.
            current_heading_level = 0
            current_heading_dir = base_dir
            del heading_stack[1:]

        elif element.name == "ul" and current_heading_level != 0:
            for link in element.find_all("a"):
                link_text = link.text.strip()
                # .get() tolerates anchors that carry no href attribute.
                link_url = link.get("href")
                if link_url:
                    _download_page(link_url, link_text, current_heading_dir)
|
|
|
|
|
|
# Script body, executed whenever this module is run or imported: convert the
# outline in content/my-first-post.md into a tree under ./content, then print
# whatever parse_markdown_file returned.
headings = parse_markdown_file(
    markdown_file_path="content/my-first-post.md",
    base_dir="./content",
)
print(headings)
|