Nicolas JEUDY
2 years ago
2 changed files with 176 additions and 0 deletions
-
109generate_v2.py
-
67generate_v3.py
@ -0,0 +1,109 @@ |
|||||
|
import io |
||||
|
import os |
||||
|
from bs4 import BeautifulSoup, Comment |
||||
|
import markdown |
||||
|
import ipdb |
||||
|
|
||||
|
parent_dir = "./content" |
||||
|
|
||||
|
|
||||
|
def parse_markdown_file(file_path):
    """Parse a Markdown file into a nested tree of its h1/h2/h3 headings.

    The file is rendered to HTML with ``markdown.markdown`` and the HTML is
    scanned line by line; only lines that start with ``<h1>``, ``<h2>`` or
    ``<h3>`` contribute to the result.

    Args:
        file_path: Path of the Markdown file to read (decoded as UTF-8).

    Returns:
        A list with one dict per ``<h1>``, each shaped like
        ``{"level": int, "text": str, "children": list}``. ``<h2>`` nodes are
        stored in the ``children`` of the preceding ``<h1>``, and ``<h3>``
        nodes in the ``children`` of the preceding ``<h2>``. A heading that
        has no suitable parent yet (for example an ``<h2>`` before the first
        ``<h1>``) is skipped instead of raising ``IndexError``.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        markdown_text = file.read()

    html = markdown.markdown(markdown_text)
    # Debug output kept from the original implementation.
    print(html)

    headings = []
    for line in html.split("\n"):
        opening_tag = line[:4]
        if opening_tag not in ("<h1>", "<h2>", "<h3>"):
            continue

        # The level comes from the tag itself, so an <h2> that follows an
        # <h3> is still recorded as level 2.
        level = int(opening_tag[2])

        # Drop the closing tag only when it is really there, so a line
        # without one does not lose its last five characters.
        text = line[4:]
        closing_tag = f"</h{level}>"
        if text.endswith(closing_tag):
            text = text[: -len(closing_tag)]

        node = {"level": level, "text": text, "children": []}
        if level == 1:
            headings.append(node)
        elif level == 2:
            if headings:
                headings[-1]["children"].append(node)
        elif headings and headings[-1]["children"]:
            headings[-1]["children"][-1]["children"].append(node)

    return headings
||||
|
|
||||
|
|
||||
|
def parse_markdown_file_2(file_path):
    """Parse a Markdown file into a tree of headings with their list links.

    The file is rendered to HTML with ``markdown.markdown`` and parsed with
    BeautifulSoup. ``<h1>``/``<h2>``/``<h3>`` elements become nodes of the
    tree, and the ``<a>`` elements found in a ``<ul>`` are attached to the
    heading that most recently preceded that list.

    Args:
        file_path: Path of the Markdown file to read (decoded as UTF-8).

    Returns:
        A list with one dict per ``<h1>``:
        ``{"text", "level", "subheadings", "links"}``. ``<h2>`` nodes have the
        same keys and live in the ``subheadings`` of the preceding ``<h1>``;
        ``<h3>`` nodes carry ``{"text", "level", "links"}`` and live in the
        ``subheadings`` of the preceding ``<h2>``. ``level`` is the heading
        level (1, 2 or 3). Each entry of ``links`` is
        ``{"text": str, "url": str}``. Headings without a suitable parent and
        lists that appear before any usable heading are skipped instead of
        raising ``IndexError``.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    html = markdown.markdown(content)
    # Debug output kept from the original implementation.
    print(html)
    soup = BeautifulSoup(html, "html.parser")

    headings = []
    # Node that receives the links of the next list; None while there is no
    # usable heading to attach them to.
    current = None

    # find_all() yields the matching tags in document order.
    for element in soup.find_all(["h1", "h2", "h3", "ul"]):
        if element.name == "ul":
            # A nested list is already covered by its outermost <ul>;
            # handling it again would duplicate its links.
            if current is None or element.find_parent("ul") is not None:
                continue
            for link in element.find_all("a"):
                url = link.get("href")
                if url is not None:
                    current["links"].append(
                        {"text": link.text.strip(), "url": url}
                    )
            continue

        # Use the heading level from the tag name (h1 -> 1, ...), not the
        # depth of the element in the HTML tree.
        level = int(element.name[1])
        node = {"text": element.text.strip(), "level": level, "links": []}

        if level == 1:
            node["subheadings"] = []
            headings.append(node)
            current = node
        elif level == 2:
            if headings:
                node["subheadings"] = []
                headings[-1]["subheadings"].append(node)
                current = node
            else:
                # Orphan heading: do not let its links leak onto another node.
                current = None
        elif headings and headings[-1]["subheadings"]:
            headings[-1]["subheadings"][-1]["subheadings"].append(node)
            current = node
        else:
            current = None

    return headings
||||
|
|
||||
|
|
||||
|
# Demo driver: parse the sample post and print its heading tree.
headings = parse_markdown_file_2("content/posts/my-first-post.md")
print(headings)

# parse_markdown_file_2 stores nested headings under the "subheadings" key
# (only parse_markdown_file uses "children"), so that is the key walked here.
for heading in headings:
    print(f"Titre de niveau {heading['level']}: {heading['text']}")
    for subheading in heading["subheadings"]:
        print(f" Sous-titre de niveau {subheading['level']}: {subheading['text']}")
        for subsubheading in subheading["subheadings"]:
            print(
                f" Sous-sous-titre de niveau {subsubheading['level']}: {subsubheading['text']}"
            )
@ -0,0 +1,67 @@ |
|||||
|
import os |
||||
|
import requests |
||||
|
from bs4 import BeautifulSoup |
||||
|
import markdown |
||||
|
|
||||
|
|
||||
|
def parse_markdown_file(markdown_file_path, base_dir="./"):
    """Mirror a Markdown file's headings as directories and download its links.

    The file is rendered to HTML with ``markdown.markdown`` (``fenced_code``
    extension) and the top-level elements of the result are walked in order:

    * ``<h1>``/``<h2>``/``<h3>``: a directory named after the heading text is
      created below the directory of the closest preceding heading of a
      lower level (or below ``base_dir`` when there is none).
    * ``<hr>``: resets the hierarchy to ``base_dir`` and ignores lists until
      the next heading.
    * ``<ul>``: for every ``<a>`` with a non-empty ``href``,
      ``<href>/download`` is fetched with ``requests.get`` and saved in the
      current heading directory under the basename of the link text.

    Args:
        markdown_file_path: Path of the Markdown file to read (UTF-8).
        base_dir: Root directory for the generated tree; created if missing.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        OSError: If the Markdown file cannot be read or a directory/file
            cannot be created.
        requests.RequestException: If a download request fails at the
            network level (an HTTP error status is reported and skipped).
    """
    with open(markdown_file_path, "r", encoding="utf-8") as f:
        markdown_text = f.read()

    html = markdown.markdown(markdown_text, extensions=["fenced_code"])
    soup = BeautifulSoup(html, "html.parser")

    os.makedirs(base_dir, exist_ok=True)

    def safe_component(text):
        """Return *text* as a single path component, or "" if unusable."""
        # Separators would create extra levels, and an absolute heading text
        # would make os.path.join() discard the parent directory entirely.
        for separator in (os.sep, os.altsep):
            if separator:
                text = text.replace(separator, "_")
        return "" if text in (".", "..") else text

    # Directory of the most recent heading seen at each level; level 0 stands
    # for base_dir. Keeping one entry per level lets sibling headings and
    # headings that move back up the hierarchy find their real parent.
    level_dirs = {0: base_dir}
    current_heading_dir = base_dir
    # Lists seen before the first heading are downloaded into base_dir.
    downloads_enabled = True

    for element in soup.children:
        if element.name in ("h1", "h2", "h3"):
            heading_level = int(element.name[1])

            parent_level = max(lvl for lvl in level_dirs if lvl < heading_level)
            parent_dir = level_dirs[parent_level]
            dir_name = safe_component(element.text.strip())
            # A heading without a usable name shares its parent's directory.
            heading_dir = os.path.join(parent_dir, dir_name) if dir_name else parent_dir
            os.makedirs(heading_dir, exist_ok=True)

            # Forget headings at this level or deeper: they are closed now.
            level_dirs = {
                lvl: path for lvl, path in level_dirs.items() if lvl < heading_level
            }
            level_dirs[heading_level] = heading_dir
            current_heading_dir = heading_dir
            downloads_enabled = True

        elif element.name == "hr":
            level_dirs = {0: base_dir}
            current_heading_dir = base_dir
            downloads_enabled = False

        elif element.name == "ul" and downloads_enabled:
            for link in element.find_all("a"):
                link_url = link.get("href")
                file_name = os.path.basename(link.text.strip())
                # Without a URL there is nothing to fetch; without a file
                # name the target would be the directory itself.
                if not link_url or file_name in ("", ".", ".."):
                    continue

                file_path = os.path.join(current_heading_dir, file_name)

                # NOTE(review): the original dropped the last character of the
                # URL unconditionally, presumably a trailing "/"; only slashes
                # are stripped now so other URLs are not truncated.
                download_url = link_url.rstrip("/") + "/download"

                # Fetch before opening the file so that a failed request does
                # not leave an empty file behind.
                response = requests.get(download_url, timeout=30)
                if not response.ok:
                    print("Skipped", link_url, "- HTTP status", response.status_code)
                    continue

                with open(file_path, "wb") as output:
                    output.write(response.content)
                print("Downloaded", link_url, "to", file_path)
||||
|
|
||||
|
|
||||
|
# Demo driver: build the directory tree for the sample post below
# ./content/posts and download the files it links to.
# NOTE(review): parse_markdown_file as defined in this file has no return
# statement, so the value printed here is None.
headings = parse_markdown_file("content/posts/my-first-post.md", "./content/posts")
print(headings)
Write
Preview
Loading…
Cancel
Save
Reference in new issue