Browse Source

[new] reflection about generator

refactor_hedgedoc_parser
Nicolas JEUDY 2 years ago
parent
commit
1c1e698924
  1. 109
      generate_v2.py
  2. 67
      generate_v3.py

109
generate_v2.py

@ -0,0 +1,109 @@
import io
import os
from bs4 import BeautifulSoup, Comment
import markdown
import ipdb
parent_dir = "./content"
def parse_markdown_file(file_path):
    """Parse a markdown file into a nested h1/h2/h3 heading tree.

    The markdown is rendered to HTML and scanned line by line for
    ``<h1>``/``<h2>``/``<h3>`` tags (each on its own line, as emitted by
    ``markdown.markdown``).

    Args:
        file_path: Path to the markdown file to read.

    Returns:
        A list of heading dicts ``{"level", "text", "children"}``; each h2
        nests under the most recent h1, each h3 under the most recent h2.
        Headings with no parent (e.g. an h2 before any h1) are skipped.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        markdown_text = file.read()
    html = markdown.markdown(markdown_text)

    headings = []
    for line in html.split("\n"):
        # line[4:-5] strips the "<hN>" prefix and "</hN>" suffix.
        if line.startswith("<h1>"):
            headings.append({"level": 1, "text": line[4:-5], "children": []})
        elif line.startswith("<h2>"):
            if not headings:
                continue  # orphan h2 with no h1 parent — skip (was IndexError)
            headings[-1]["children"].append(
                {"level": 2, "text": line[4:-5], "children": []}
            )
        elif line.startswith("<h3>"):
            if not headings or not headings[-1]["children"]:
                continue  # orphan h3 with no h2 parent — skip (was IndexError)
            headings[-1]["children"][-1]["children"].append(
                {"level": 3, "text": line[4:-5], "children": []}
            )
    return headings
def parse_markdown_file_2(file_path):
    """Parse a markdown file into a nested heading tree with links.

    The markdown is rendered to HTML and walked with BeautifulSoup. Each
    h1 becomes a top-level node; each h2 nests under the last h1, each h3
    under the last h2. Links found in a ``<ul>`` attach to the most
    recently seen heading.

    Args:
        file_path: Path to the markdown file to read.

    Returns:
        A list of h1 dicts ``{"text", "level", "subheadings", "links"}``.
        h3 nodes carry ``{"text", "level", "links"}`` (no further nesting).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    html = markdown.markdown(content)
    soup = BeautifulSoup(html, "html.parser")

    headings = []
    # Level (1-3) of the most recent heading; 0 means none seen yet.
    # NOTE: the original passed DOM depth as the level, but markdown emits
    # all headings as direct children of the root, so every node was
    # recorded level 1 and all links landed on the last h1 — fixed by
    # deriving the level from the tag name.
    current_level = 0

    def handle(element):
        nonlocal current_level
        if element.name in ("h1", "h2", "h3"):
            level = int(element.name[1])
            node = {"text": element.text.strip(), "level": level, "links": []}
            if level == 1:
                node["subheadings"] = []
                headings.append(node)
            elif level == 2 and headings:
                node["subheadings"] = []
                headings[-1]["subheadings"].append(node)
            elif level == 3 and headings and headings[-1]["subheadings"]:
                headings[-1]["subheadings"][-1]["subheadings"].append(node)
            else:
                return  # orphan heading with no parent — skip (was IndexError)
            current_level = level
        elif element.name == "ul":
            links = []
            for li in element.find_all("li"):
                link = li.find("a")
                if link is not None:
                    links.append({"text": link.text.strip(), "url": link["href"]})
            # Attach to the most recently seen heading, if any.
            if current_level == 1 and headings:
                headings[-1]["links"].extend(links)
            elif current_level == 2 and headings:
                headings[-1]["subheadings"][-1]["links"].extend(links)
            elif current_level == 3 and headings:
                headings[-1]["subheadings"][-1]["subheadings"][-1]["links"].extend(
                    links
                )

    for child in soup.children:
        if isinstance(child, (str, Comment)):
            continue
        handle(child)
    return headings
if __name__ == "__main__":
    headings = parse_markdown_file_2("content/posts/my-first-post.md")
    print(headings)
    for heading in headings:
        print(f"Titre de niveau {heading['level']}: {heading['text']}")
        # parse_markdown_file_2 keys children under "subheadings", not
        # "children" — the original loop raised KeyError here.
        for subheading in heading["subheadings"]:
            print(f" Sous-titre de niveau {subheading['level']}: {subheading['text']}")
            for subsubheading in subheading.get("subheadings", []):
                print(
                    f" Sous-sous-titre de niveau {subsubheading['level']}: {subsubheading['text']}"
                )

67
generate_v3.py

@ -0,0 +1,67 @@
import os
import requests
from bs4 import BeautifulSoup
import markdown
def parse_markdown_file(markdown_file_path, base_dir="./"):
    """Mirror a markdown file's h1-h3 outline as a directory tree and
    download each linked document into its heading's directory.

    Each heading becomes a directory nested under the directory of the
    nearest shallower heading. Links inside a ``<ul>`` are downloaded
    (via ``<url>/download``) into the directory of the most recent
    heading. A ``<hr>`` resets the outline: lists after it are ignored
    until the next heading.

    Args:
        markdown_file_path: Path to the markdown file to read.
        base_dir: Root directory for the generated tree (created if missing).

    Returns:
        None. Works entirely by side effect (mkdir + file downloads).
    """
    with open(markdown_file_path, "r", encoding="utf-8") as f:
        markdown_text = f.read()
    html = markdown.markdown(markdown_text, extensions=["fenced_code"])
    soup = BeautifulSoup(html, "html.parser")

    os.makedirs(base_dir, exist_ok=True)

    level_dirs = {}         # heading level -> directory created for it
    current_dir = base_dir  # directory of the most recent heading
    current_level = 0       # 0 = no active heading (start, or after <hr>)

    for element in soup.children:
        if element.name in ("h1", "h2", "h3"):
            heading_text = element.text.strip()
            heading_level = int(element.name[1])
            # Parent is the directory of the nearest shallower heading,
            # falling back to base_dir. (The original nested a shallower
            # heading inside the previous deeper one, e.g. an h1 after an
            # h3 ended up under the h3's directory — fixed here.)
            parent = base_dir
            for lvl in range(heading_level - 1, 0, -1):
                if lvl in level_dirs:
                    parent = level_dirs[lvl]
                    break
            heading_dir = os.path.join(parent, heading_text)
            os.makedirs(heading_dir, exist_ok=True)
            # Forget deeper levels so later siblings cannot nest under them.
            level_dirs = {l: d for l, d in level_dirs.items() if l < heading_level}
            level_dirs[heading_level] = heading_dir
            current_level = heading_level
            current_dir = heading_dir
        elif element.name == "hr":
            current_level = 0
            current_dir = base_dir
            level_dirs = {}
        elif element.name == "ul" and current_level != 0:
            for link in element.find_all("a"):
                link_text = link.text.strip()
                link_url = link["href"]
                if not link_url:
                    continue
                file_path = os.path.join(
                    current_dir, os.path.basename(link_text)
                )
                # NOTE(review): `[:-1]` drops the URL's final character
                # before appending "/download" — presumably strips a
                # trailing slash from a HedgeDoc note URL; confirm against
                # real links.
                response = requests.get(link_url[:-1] + "/download")
                # Fetch before opening so a failed request does not leave
                # an empty file behind (original opened the file first).
                with open(file_path, "wb") as out:
                    out.write(response.content)
                print("Downloaded", link_url, "to", file_path)
if __name__ == "__main__":
    # parse_markdown_file works by side effect (creates directories and
    # downloads files) and returns None, so there is no result to print —
    # the original `print(headings)` only ever printed `None`.
    parse_markdown_file(
        "content/posts/my-first-post.md", base_dir="./content/posts"
    )
Loading…
Cancel
Save