You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

109 lines
3.8 KiB

  1. import io
  2. import os
  3. from bs4 import BeautifulSoup, Comment
  4. import markdown
  5. import ipdb
# Relative path of the content directory.
# NOTE(review): not referenced by any code visible in this part of the file —
# confirm whether it is still needed.
parent_dir = "./content"
  7. def parse_markdown_file(file_path):
  8. with open(file_path, "r") as file:
  9. markdown_text = file.read()
  10. html = markdown.markdown(markdown_text)
  11. print(html)
  12. headings = []
  13. current_level = 0
  14. for line in html.split("\n"):
  15. if line.startswith("<h1>"):
  16. current_level = 1
  17. headings.append(
  18. {"level": current_level, "text": line[4:-5], "children": []}
  19. )
  20. elif line.startswith("<h2>"):
  21. if current_level < 2:
  22. current_level = 2
  23. headings[-1]["children"].append(
  24. {"level": current_level, "text": line[4:-5], "children": []}
  25. )
  26. else:
  27. headings[-1]["children"].append(
  28. {"level": current_level, "text": line[4:-5], "children": []}
  29. )
  30. elif line.startswith("<h3>"):
  31. if current_level < 3:
  32. current_level = 3
  33. headings[-1]["children"][-1]["children"].append(
  34. {"level": current_level, "text": line[4:-5], "children": []}
  35. )
  36. else:
  37. headings[-1]["children"][-1]["children"].append(
  38. {"level": current_level, "text": line[4:-5], "children": []}
  39. )
  40. return headings
  41. def parse_markdown_file_2(file_path):
  42. with open(file_path, "r", encoding="utf-8") as f:
  43. content = f.read()
  44. html = markdown.markdown(content)
  45. print(html)
  46. soup = BeautifulSoup(html, "html.parser")
  47. headings = []
  48. def parse_element(element, level):
  49. print(element)
  50. if element.name == "h1":
  51. heading = {
  52. "text": element.text.strip(),
  53. "level": level,
  54. "subheadings": [],
  55. "links": [],
  56. }
  57. headings.append(heading)
  58. elif element.name == "h2":
  59. subheading = {
  60. "text": element.text.strip(),
  61. "level": level,
  62. "subheadings": [],
  63. "links": [],
  64. }
  65. headings[-1]["subheadings"].append(subheading)
  66. elif element.name == "h3":
  67. subsubheading = {"text": element.text.strip(), "level": level, "links": []}
  68. headings[-1]["subheadings"][-1]["subheadings"].append(subsubheading)
  69. elif element.name == "ul":
  70. links = []
  71. for li in element.find_all("li"):
  72. link = li.find("a")
  73. if link is not None:
  74. links.append({"text": link.text.strip(), "url": link["href"]})
  75. if level == 1:
  76. headings[-1]["links"].extend(links)
  77. elif level == 2:
  78. headings[-1]["subheadings"][-1]["links"].extend(links)
  79. elif level == 3:
  80. headings[-1]["subheadings"][-1]["subheadings"][-1]["links"].extend(
  81. links
  82. )
  83. for child in element.children:
  84. if isinstance(child, str) or isinstance(child, Comment):
  85. continue
  86. parse_element(child, level + 1)
  87. parse_element(soup, 0)
  88. return headings
  89. headings = parse_markdown_file_2("content/posts/my-first-post.md")
  90. print(headings)
  91. for heading in headings:
  92. print(f"Titre de niveau {heading['level']}: {heading['text']}")
  93. for subheading in heading["children"]:
  94. print(f" Sous-titre de niveau {subheading['level']}: {subheading['text']}")
  95. for subsubheading in subheading["children"]:
  96. print(
  97. f" Sous-sous-titre de niveau {subsubheading['level']}: {subsubheading['text']}"
  98. )