You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

66 lines
2.3 KiB

import os
import requests
from bs4 import BeautifulSoup
import markdown
def _download_document(doc_link, destination, timeout):
    """Fetch *doc_link* and write the response body to *destination*.

    The request is completed and its HTTP status checked before the
    destination file is opened, so a failed download never leaves an empty
    file or a saved error page behind.

    Args:
        doc_link: URL to download.
        destination: Path of the file to write (created or overwritten).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the file was written, False when the request failed.
    """
    try:
        response = requests.get(doc_link, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable document should not abort the remaining downloads.
        print("Failed to download", doc_link, "-", exc)
        return False
    with open(destination, "wb") as f:
        f.write(response.content)
    print("Downloaded", doc_link, "to", destination)
    return True


def parse_markdown_file(markdown_file_path, base_dir="./", timeout=30):
    """Mirror a markdown file's heading structure as directories and download
    the documents linked from its bullet lists.

    The markdown is rendered to HTML and its top-level elements are walked in
    order:

    * ``h1``/``h2``/``h3`` create a directory named after the heading text,
      nested under the closest preceding heading of a shallower level (or
      under ``base_dir`` when there is none).
    * ``hr`` resets the position to ``base_dir``; bullet lists that follow are
      ignored until the next heading.
    * ``ul`` downloads ``<href>/download`` for every link it contains into the
      current directory, as ``<link text>.md``.

    Args:
        markdown_file_path: Path of the markdown file to read (UTF-8).
        base_dir: Root directory under which heading directories are created.
        timeout: Seconds to wait for each download request.

    Returns:
        The heading directory paths, in document order.

    Raises:
        OSError: If the markdown file cannot be read, or a directory or
            downloaded file cannot be written.
    """
    with open(markdown_file_path, "r", encoding="utf-8") as f:
        markdown_text = f.read()
    html = markdown.markdown(markdown_text, extensions=["fenced_code"])
    soup = BeautifulSoup(html, "html.parser")
    os.makedirs(base_dir, exist_ok=True)

    # Stack of (level, directory) for the headings currently "open",
    # outermost first. Tracking every open level (rather than only the last
    # two directories) keeps sibling headings side by side and lets a
    # shallower heading climb back out of a deeper one.
    open_headings = []
    heading_dirs = []
    current_dir = base_dir
    skip_lists = False  # set by <hr>, cleared by the next heading

    for element in soup.children:
        if element.name in ("h1", "h2", "h3"):
            heading_text = element.text.strip()
            heading_level = int(element.name[1])
            # Close every heading at the same or a deeper level; what is left
            # on top of the stack is this heading's parent.
            while open_headings and open_headings[-1][0] >= heading_level:
                open_headings.pop()
            parent_dir = open_headings[-1][1] if open_headings else base_dir
            current_dir = os.path.join(parent_dir, heading_text)
            os.makedirs(current_dir, exist_ok=True)
            open_headings.append((heading_level, current_dir))
            heading_dirs.append(current_dir)
            skip_lists = False
        elif element.name == "hr":
            open_headings.clear()
            current_dir = base_dir
            skip_lists = True
        elif element.name == "ul" and not skip_lists:
            for link in element.find_all("a"):
                # .get() tolerates anchors that carry no href attribute.
                link_url = link.get("href")
                # basename() keeps a slash in the link text from escaping
                # the current directory.
                file_name = os.path.basename(link.text.strip())
                if not link_url or not file_name:
                    continue
                destination = os.path.join(current_dir, file_name) + ".md"
                _download_document(link_url + "/download", destination, timeout)

    return heading_dirs
if __name__ == "__main__":
    # Run the conversion only when this file is executed as a script, so that
    # importing the module does not create directories or start downloads.
    headings = parse_markdown_file("content/my-first-post.md", base_dir="./content")
    print(headings)