Browse Source

[new] reflection about generator

refactor_hedgedoc_parser
Nicolas JEUDY 2 years ago
parent
commit
1c1e698924
  1. 109
      generate_v2.py
  2. 67
      generate_v3.py

109
generate_v2.py

@ -0,0 +1,109 @@
import io
import os
from bs4 import BeautifulSoup, Comment
import markdown
import ipdb
parent_dir = "./content"
def parse_markdown_file(file_path):
    """Parse a markdown file into a nested h1/h2/h3 heading tree.

    The markdown is rendered to HTML and scanned line by line for
    ``<h1>``/``<h2>``/``<h3>`` tags (each on its own line, as emitted by
    ``markdown.markdown``).

    Args:
        file_path: Path to the markdown file to read.

    Returns:
        A list of heading dicts ``{"level", "text", "children"}``; each h2
        nests under the most recent h1, each h3 under the most recent h2.
        Headings with no parent (e.g. an h2 before any h1) are skipped.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        markdown_text = file.read()
    html = markdown.markdown(markdown_text)

    headings = []
    for line in html.split("\n"):
        # line[4:-5] strips the "<hN>" prefix and "</hN>" suffix.
        if line.startswith("<h1>"):
            headings.append({"level": 1, "text": line[4:-5], "children": []})
        elif line.startswith("<h2>"):
            if not headings:
                continue  # orphan h2 with no h1 parent — skip (was IndexError)
            headings[-1]["children"].append(
                {"level": 2, "text": line[4:-5], "children": []}
            )
        elif line.startswith("<h3>"):
            if not headings or not headings[-1]["children"]:
                continue  # orphan h3 with no h2 parent — skip (was IndexError)
            headings[-1]["children"][-1]["children"].append(
                {"level": 3, "text": line[4:-5], "children": []}
            )
    return headings
def parse_markdown_file_2(file_path):
    """Parse a markdown file into a nested heading tree with links.

    The markdown is rendered to HTML and walked with BeautifulSoup. Each
    h1 becomes a top-level node; each h2 nests under the last h1, each h3
    under the last h2. Links found in a ``<ul>`` attach to the most
    recently seen heading.

    Args:
        file_path: Path to the markdown file to read.

    Returns:
        A list of h1 dicts ``{"text", "level", "subheadings", "links"}``.
        h3 nodes carry ``{"text", "level", "links"}`` (no further nesting).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    html = markdown.markdown(content)
    soup = BeautifulSoup(html, "html.parser")

    headings = []
    # Level (1-3) of the most recent heading; 0 means none seen yet.
    # NOTE: the original passed DOM depth as the level, but markdown emits
    # all headings as direct children of the root, so every node was
    # recorded level 1 and all links landed on the last h1 — fixed by
    # deriving the level from the tag name.
    current_level = 0

    def handle(element):
        nonlocal current_level
        if element.name in ("h1", "h2", "h3"):
            level = int(element.name[1])
            node = {"text": element.text.strip(), "level": level, "links": []}
            if level == 1:
                node["subheadings"] = []
                headings.append(node)
            elif level == 2 and headings:
                node["subheadings"] = []
                headings[-1]["subheadings"].append(node)
            elif level == 3 and headings and headings[-1]["subheadings"]:
                headings[-1]["subheadings"][-1]["subheadings"].append(node)
            else:
                return  # orphan heading with no parent — skip (was IndexError)
            current_level = level
        elif element.name == "ul":
            links = []
            for li in element.find_all("li"):
                link = li.find("a")
                if link is not None:
                    links.append({"text": link.text.strip(), "url": link["href"]})
            # Attach to the most recently seen heading, if any.
            if current_level == 1 and headings:
                headings[-1]["links"].extend(links)
            elif current_level == 2 and headings:
                headings[-1]["subheadings"][-1]["links"].extend(links)
            elif current_level == 3 and headings:
                headings[-1]["subheadings"][-1]["subheadings"][-1]["links"].extend(
                    links
                )

    for child in soup.children:
        if isinstance(child, (str, Comment)):
            continue
        handle(child)
    return headings
if __name__ == "__main__":
    headings = parse_markdown_file_2("content/posts/my-first-post.md")
    print(headings)
    for heading in headings:
        print(f"Titre de niveau {heading['level']}: {heading['text']}")
        # parse_markdown_file_2 keys children under "subheadings", not
        # "children" — the original loop raised KeyError here.
        for subheading in heading["subheadings"]:
            print(f" Sous-titre de niveau {subheading['level']}: {subheading['text']}")
            for subsubheading in subheading.get("subheadings", []):
                print(
                    f" Sous-sous-titre de niveau {subsubheading['level']}: {subsubheading['text']}"
                )

67
generate_v3.py

@ -0,0 +1,67 @@
import os
import requests
from bs4 import BeautifulSoup
import markdown
def parse_markdown_file(markdown_file_path, base_dir="./"):
    """Mirror a markdown file's h1-h3 outline as a directory tree and
    download each linked document into its heading's directory.

    Each heading becomes a directory nested under the directory of the
    nearest shallower heading. Links inside a ``<ul>`` are downloaded
    (via ``<url>/download``) into the directory of the most recent
    heading. A ``<hr>`` resets the outline: lists after it are ignored
    until the next heading.

    Args:
        markdown_file_path: Path to the markdown file to read.
        base_dir: Root directory for the generated tree (created if missing).

    Returns:
        None. Works entirely by side effect (mkdir + file downloads).
    """
    with open(markdown_file_path, "r", encoding="utf-8") as f:
        markdown_text = f.read()
    html = markdown.markdown(markdown_text, extensions=["fenced_code"])
    soup = BeautifulSoup(html, "html.parser")

    os.makedirs(base_dir, exist_ok=True)

    level_dirs = {}         # heading level -> directory created for it
    current_dir = base_dir  # directory of the most recent heading
    current_level = 0       # 0 = no active heading (start, or after <hr>)

    for element in soup.children:
        if element.name in ("h1", "h2", "h3"):
            heading_text = element.text.strip()
            heading_level = int(element.name[1])
            # Parent is the directory of the nearest shallower heading,
            # falling back to base_dir. (The original nested a shallower
            # heading inside the previous deeper one, e.g. an h1 after an
            # h3 ended up under the h3's directory — fixed here.)
            parent = base_dir
            for lvl in range(heading_level - 1, 0, -1):
                if lvl in level_dirs:
                    parent = level_dirs[lvl]
                    break
            heading_dir = os.path.join(parent, heading_text)
            os.makedirs(heading_dir, exist_ok=True)
            # Forget deeper levels so later siblings cannot nest under them.
            level_dirs = {l: d for l, d in level_dirs.items() if l < heading_level}
            level_dirs[heading_level] = heading_dir
            current_level = heading_level
            current_dir = heading_dir
        elif element.name == "hr":
            current_level = 0
            current_dir = base_dir
            level_dirs = {}
        elif element.name == "ul" and current_level != 0:
            for link in element.find_all("a"):
                link_text = link.text.strip()
                link_url = link["href"]
                if not link_url:
                    continue
                file_path = os.path.join(
                    current_dir, os.path.basename(link_text)
                )
                # NOTE(review): `[:-1]` drops the URL's final character
                # before appending "/download" — presumably strips a
                # trailing slash from a HedgeDoc note URL; confirm against
                # real links.
                response = requests.get(link_url[:-1] + "/download")
                # Fetch before opening so a failed request does not leave
                # an empty file behind (original opened the file first).
                with open(file_path, "wb") as out:
                    out.write(response.content)
                print("Downloaded", link_url, "to", file_path)
if __name__ == "__main__":
    # parse_markdown_file works by side effect (creates directories and
    # downloads files) and returns None, so there is no result to print —
    # the original `print(headings)` only ever printed `None`.
    parse_markdown_file(
        "content/posts/my-first-post.md", base_dir="./content/posts"
    )
Loading…
Cancel
Save