Nicolas JEUDY
2 years ago
2 changed files with 176 additions and 0 deletions
generate_v2.py  +109 −0
generate_v3.py  +67 −0
generate_v2.py
@@ -0,0 +1,109 @@
import markdown
from bs4 import BeautifulSoup, Comment


def parse_markdown_file(file_path):
    """Parse rendered markdown into a nested heading tree by matching tags line by line."""
    with open(file_path, "r") as file:
        markdown_text = file.read()
    html = markdown.markdown(markdown_text)
    headings = []
    current_level = 0
    for line in html.split("\n"):
        # "<hN>" is 4 characters and "</hN>" is 5, so line[4:-5] is the heading text.
        if line.startswith("<h1>"):
            current_level = 1
            headings.append({"level": 1, "text": line[4:-5], "children": []})
        elif line.startswith("<h2>"):
            current_level = 2
            headings[-1]["children"].append(
                {"level": 2, "text": line[4:-5], "children": []}
            )
        elif line.startswith("<h3>"):
            current_level = 3
            headings[-1]["children"][-1]["children"].append(
                {"level": 3, "text": line[4:-5], "children": []}
            )
    return headings


def parse_markdown_file_2(file_path):
    """Parse markdown into a heading tree with attached links, using BeautifulSoup."""
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    html = markdown.markdown(content)
    soup = BeautifulSoup(html, "html.parser")
    headings = []
    current_level = 0  # level of the most recently seen heading

    def parse_element(element):
        nonlocal current_level
        if element.name == "h1":
            current_level = 1
            headings.append(
                {"text": element.text.strip(), "level": 1, "subheadings": [], "links": []}
            )
        elif element.name == "h2":
            current_level = 2
            headings[-1]["subheadings"].append(
                {"text": element.text.strip(), "level": 2, "subheadings": [], "links": []}
            )
        elif element.name == "h3":
            current_level = 3
            headings[-1]["subheadings"][-1]["subheadings"].append(
                {"text": element.text.strip(), "level": 3, "subheadings": [], "links": []}
            )
        elif element.name == "ul":
            links = []
            for li in element.find_all("li"):
                link = li.find("a")
                if link is not None:
                    links.append({"text": link.text.strip(), "url": link["href"]})
            # Attach the links to the section opened by the most recent heading.
            # Using the DOM depth here would not work: with html.parser every
            # top-level element sits at the same depth, so the depth is always 1.
            if current_level == 1:
                headings[-1]["links"].extend(links)
            elif current_level == 2:
                headings[-1]["subheadings"][-1]["links"].extend(links)
            elif current_level == 3:
                headings[-1]["subheadings"][-1]["subheadings"][-1]["links"].extend(links)

        for child in element.children:
            if isinstance(child, (str, Comment)):
                continue
            parse_element(child)

    parse_element(soup)
    return headings


headings = parse_markdown_file_2("content/posts/my-first-post.md")
for heading in headings:
    print(f"Level {heading['level']} heading: {heading['text']}")
    for subheading in heading["subheadings"]:
        print(f"  Level {subheading['level']} subheading: {subheading['text']}")
        for subsubheading in subheading["subheadings"]:
            print(
                f"    Level {subsubheading['level']} sub-subheading: {subsubheading['text']}"
            )
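For illustration, a minimal smoke test for parse_markdown_file_2; the file path, headings, and share URL below are hypothetical, not part of the commit:

sample = """# Documents

## Reports

- [report.pdf](https://cloud.example.com/s/abc123/)
"""
with open("/tmp/sample.md", "w", encoding="utf-8") as f:
    f.write(sample)

# Expected shape of the result, roughly:
# [{'text': 'Documents', 'level': 1, 'subheadings': [
#      {'text': 'Reports', 'level': 2, 'subheadings': [],
#       'links': [{'text': 'report.pdf',
#                  'url': 'https://cloud.example.com/s/abc123/'}]}],
#   'links': []}]
print(parse_markdown_file_2("/tmp/sample.md"))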
generate_v3.py
@@ -0,0 +1,67 @@
import os

import markdown
import requests
from bs4 import BeautifulSoup


def parse_markdown_file(markdown_file_path, base_dir="./"):
    """Mirror the markdown file's heading structure as directories and download
    every linked file into the directory of its enclosing heading."""
    with open(markdown_file_path, "r") as f:
        markdown_text = f.read()

    html = markdown.markdown(markdown_text, extensions=["fenced_code"])
    soup = BeautifulSoup(html, "html.parser")

    os.makedirs(base_dir, exist_ok=True)

    # Remember the directory created for each heading level, so a heading is
    # always nested under the directory of the level above it, even when the
    # document steps back up (e.g. an h2 after a run of h3s).
    level_dirs = {0: base_dir}
    current_heading_level = 0
    current_heading_dir = base_dir

    for element in soup.children:
        if element.name in ["h1", "h2", "h3"]:
            # Get the text of the heading and the heading level
            heading_text = element.text.strip()
            heading_level = int(element.name[1])

            # Create the heading's directory under its parent level's directory
            parent_dir = level_dirs.get(heading_level - 1, base_dir)
            heading_dir = os.path.join(parent_dir, heading_text)
            os.makedirs(heading_dir, exist_ok=True)

            # Set the current heading level and directory
            level_dirs[heading_level] = heading_dir
            current_heading_level = heading_level
            current_heading_dir = heading_dir

        elif element.name == "hr":
            # A horizontal rule resets everything back to the base directory
            current_heading_level = 0
            current_heading_dir = base_dir

        elif element.name == "ul" and current_heading_level != 0:
            # Download every link in the list into the current heading's directory
            for link in element.find_all("a"):
                link_text = link.text.strip()
                link_url = link["href"]
                if not link_url:
                    continue

                file_path = os.path.join(
                    current_heading_dir, os.path.basename(link_text)
                )
                # The link URLs are assumed to end with a trailing slash;
                # strip it and append "/download" to fetch the file itself.
                response = requests.get(link_url[:-1] + "/download")
                with open(file_path, "wb") as f:
                    f.write(response.content)
                print("Downloaded", link_url, "to", file_path)


parse_markdown_file("content/posts/my-first-post.md", base_dir="./content/posts")
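As a hypothetical dry run (reusing the illustrative /tmp/sample.md from the sketch after generate_v2.py, so the paths and URL are assumptions, not from the commit):

# Given the sample post, this would create
#   ./content/posts/Documents/
#   ./content/posts/Documents/Reports/
# and write the body of GET https://cloud.example.com/s/abc123/download
# to ./content/posts/Documents/Reports/report.pdf.
parse_markdown_file("/tmp/sample.md", base_dir="./content/posts")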