# generate_v2.py
#
# Reconstructed from a mangled unified diff: the original text had its HTML
# tag literals stripped.  The slices `line[4:-5]` in the original imply the
# missing literals were exactly "<h1>"/"<h2>"/"<h3>" (4 and 5 characters).
#
# Fixes vs the original:
#   * removed unused `io`/`os` imports and the `ipdb` debugger import
#     (which crashes on machines without ipdb installed);
#   * collapsed the dead identical if/else arms in parse_markdown_file;
#   * guarded against h2/h3 headings with no parent (IndexError before);
#   * the driver loop now reads the "subheadings" key that
#     parse_markdown_file_2 actually produces (it read "children" before,
#     a guaranteed KeyError);
#   * dropped the raw debug prints of the HTML and of every DOM element.
import markdown
from bs4 import BeautifulSoup, Comment

parent_dir = "./content"


def parse_markdown_file(file_path):
    """Parse h1-h3 headings of a markdown file into a nested structure.

    Line-prefix parsing: relies on the `markdown` package emitting simple
    headings as single-line "<hN>text</hN>" elements.

    Returns a list of h1 nodes, each
    ``{"level": int, "text": str, "children": [...]}`` with h2 nodes
    nested under the preceding h1 and h3 nodes under the preceding h2.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        html = markdown.markdown(file.read())

    headings = []
    for line in html.split("\n"):
        # line[4:-5] strips the surrounding "<hN>" and "</hN>" tags.
        if line.startswith("<h1>"):
            headings.append({"level": 1, "text": line[4:-5], "children": []})
        elif line.startswith("<h2>"):
            if not headings:
                # h2 before any h1: synthesize an empty parent instead of
                # crashing with IndexError as the original did.
                headings.append({"level": 1, "text": "", "children": []})
            headings[-1]["children"].append(
                {"level": 2, "text": line[4:-5], "children": []}
            )
        elif line.startswith("<h3>"):
            if not headings:
                headings.append({"level": 1, "text": "", "children": []})
            if not headings[-1]["children"]:
                headings[-1]["children"].append(
                    {"level": 2, "text": "", "children": []}
                )
            headings[-1]["children"][-1]["children"].append(
                {"level": 3, "text": line[4:-5], "children": []}
            )
    return headings


def parse_markdown_file_2(file_path):
    """Parse headings AND bullet-list links from a markdown file via bs4.

    Returns a list of h1 nodes shaped
    ``{"text", "level", "subheadings": [...], "links": [...]}``; h2 nodes
    nest under the last h1, h3 nodes under the last h2 (h3 nodes carry no
    "subheadings" list).  Every <a> found inside a <ul> attaches to the
    most recent heading at the current DOM depth.

    NOTE(review): "level" records the DOM recursion depth, not the heading
    rank — kept as-is to preserve the original output shape.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    html = markdown.markdown(content)
    soup = BeautifulSoup(html, "html.parser")
    headings = []

    def parse_element(element, level):
        if element.name == "h1":
            headings.append(
                {
                    "text": element.text.strip(),
                    "level": level,
                    "subheadings": [],
                    "links": [],
                }
            )
        elif element.name == "h2" and headings:
            # `and headings` guards the orphan-heading IndexError.
            headings[-1]["subheadings"].append(
                {
                    "text": element.text.strip(),
                    "level": level,
                    "subheadings": [],
                    "links": [],
                }
            )
        elif element.name == "h3" and headings and headings[-1]["subheadings"]:
            headings[-1]["subheadings"][-1]["subheadings"].append(
                {"text": element.text.strip(), "level": level, "links": []}
            )
        elif element.name == "ul" and headings:
            links = [
                {"text": a.text.strip(), "url": a["href"]}
                for li in element.find_all("li")
                if (a := li.find("a")) is not None
            ]
            # Attach the links to the node matching the current depth.
            if level == 1:
                headings[-1]["links"].extend(links)
            elif level == 2 and headings[-1]["subheadings"]:
                headings[-1]["subheadings"][-1]["links"].extend(links)
            elif level == 3 and headings[-1]["subheadings"]:
                sub = headings[-1]["subheadings"][-1]
                if sub["subheadings"]:
                    sub["subheadings"][-1]["links"].extend(links)

        for child in element.children:
            # Skip text nodes and HTML comments; recurse into elements.
            if isinstance(child, (str, Comment)):
                continue
            parse_element(child, level + 1)

    parse_element(soup, 0)
    return headings


if __name__ == "__main__":
    headings = parse_markdown_file_2("content/posts/my-first-post.md")
    print(headings)
    for heading in headings:
        print(f"Titre de niveau {heading['level']}: {heading['text']}")
        # Bug fix: the nodes built above use "subheadings", not "children".
        for subheading in heading["subheadings"]:
            print(
                f"  Sous-titre de niveau {subheading['level']}: "
                f"{subheading['text']}"
            )
            for subsubheading in subheading["subheadings"]:
                print(
                    f"    Sous-sous-titre de niveau {subsubheading['level']}: "
                    f"{subsubheading['text']}"
                )
# generate_v3.py
#
# Mirrors a markdown outline as a directory tree and downloads each linked
# resource into the directory of its enclosing heading.
import os

import requests
from bs4 import BeautifulSoup
import markdown


def parse_markdown_file(markdown_file_path, base_dir="./"):
    """Create one directory per h1/h2/h3 heading and download <ul> links.

    Each <a href> found inside a <ul> is fetched from "<href>/download"
    and written into the directory of the current heading.  An <hr>
    resets the current directory back to *base_dir* and suspends
    downloading until the next heading.

    Fixes vs the original:
      * ``link_url.rstrip("/")`` instead of ``link_url[:-1]``, which
        unconditionally chopped the last character of URLs that had no
        trailing slash;
      * the HTTP request is issued *before* the output file is opened, so
        a failed request no longer leaves an empty/truncated file behind;
      * ``timeout`` and ``raise_for_status()`` so network errors surface
        instead of silently writing an error page to disk.

    Returns the list of file paths written (the original returned None,
    which the caller then printed; a list is strictly more informative).
    Raises ``requests.RequestException`` on network/HTTP failure.
    """
    with open(markdown_file_path, "r", encoding="utf-8") as f:
        markdown_text = f.read()

    html = markdown.markdown(markdown_text, extensions=["fenced_code"])
    soup = BeautifulSoup(html, "html.parser")

    os.makedirs(base_dir, exist_ok=True)

    current_heading_level = 1
    current_heading_dir = base_dir
    last_heading_dir = base_dir
    downloaded = []

    for element in soup.children:
        if element.name in ("h1", "h2", "h3"):
            heading_text = element.text.strip()
            heading_level = int(element.name[1])

            # Same-level headings become siblings (nested under the
            # previous heading's parent); a level change nests under the
            # current heading.
            if heading_level == current_heading_level:
                heading_dir = os.path.join(last_heading_dir, heading_text)
            else:
                heading_dir = os.path.join(current_heading_dir, heading_text)
            os.makedirs(heading_dir, exist_ok=True)

            current_heading_level = heading_level
            last_heading_dir = current_heading_dir
            current_heading_dir = heading_dir

        elif element.name == "hr":
            # Horizontal rule: reset to the base directory and stop
            # attaching downloads until the next heading appears.
            current_heading_level = 0
            current_heading_dir = base_dir

        elif element.name == "ul" and current_heading_level != 0:
            for link in element.find_all("a"):
                link_text = link.text.strip()
                link_url = link["href"]
                if not link_url:
                    continue

                file_path = os.path.join(
                    current_heading_dir, os.path.basename(link_text)
                )
                response = requests.get(
                    link_url.rstrip("/") + "/download", timeout=30
                )
                response.raise_for_status()
                with open(file_path, "wb") as out:
                    out.write(response.content)
                print("Downloaded", link_url, "to", file_path)
                downloaded.append(file_path)

    return downloaded


if __name__ == "__main__":
    headings = parse_markdown_file(
        "content/posts/my-first-post.md", base_dir="./content/posts"
    )
    print(headings)