You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

160 lines
5.6 KiB

  1. import os
  2. import re
  3. import requests
  4. from bs4 import BeautifulSoup
  5. import markdown
  6. def parse_for_toc(content):
  7. # TODO: ADD .TableofContent directly to the page (see hugo theme ?)
  8. content = content.replace("[TOC]", "")
  9. return content
  10. def parse_for_notice(content):
  11. result = re.findall(":::warning.*?:::", content, re.MULTILINE | re.DOTALL)
  12. for notice in result:
  13. old = notice
  14. content = content.replace(
  15. old,
  16. notice.replace(":::warning", "{{% notice info %}}").replace(
  17. ":::", "{{% /notice %}}"
  18. ),
  19. )
  20. result = re.findall(":::info.*?:::", content, re.MULTILINE | re.DOTALL)
  21. for notice in result:
  22. old = notice
  23. content = content.replace(
  24. old,
  25. notice.replace(":::info", "{{% notice note %}}").replace(
  26. ":::", "{{% /notice %}}"
  27. ),
  28. )
  29. result = re.findall(":::success.*?:::", content, re.MULTILINE | re.DOTALL)
  30. for notice in result:
  31. old = notice
  32. content = content.replace(
  33. old,
  34. notice.replace(":::success", "{{% notice tip %}}").replace(
  35. ":::", "{{% /notice %}}"
  36. ),
  37. )
  38. result = re.findall(":::danger.*?:::", content, re.MULTILINE | re.DOTALL)
  39. for notice in result:
  40. old = notice
  41. content = content.replace(
  42. old,
  43. notice.replace(":::danger", "{{% notice warning %}}").replace(
  44. ":::", "{{% /notice %}}"
  45. ),
  46. )
  47. # print(content)
  48. return content
  49. def parse_markdown_file(markdown_file_path, base_dir="./"):
  50. with open(markdown_file_path, "r") as f:
  51. markdown_text = f.read()
  52. html = markdown.markdown(markdown_text, extensions=["fenced_code"])
  53. soup = BeautifulSoup(html, "html.parser")
  54. if not os.path.exists(base_dir):
  55. os.makedirs(base_dir)
  56. current_heading_level = 1
  57. current_heading_dir = base_dir
  58. last_heading_dir = base_dir
  59. for element in soup.children:
  60. if element.name in ["h1", "h2", "h3"]:
  61. # Get the text of the heading and the heading level
  62. heading_text = element.text.strip()
  63. heading_level = int(element.name[1])
  64. # Determine the directory to create for the heading
  65. print(
  66. "heading_level: %s(%s) , heading_text: %s, base_dir: %s, last_heading_dir: %s, current_heading_dir: %s "
  67. % (
  68. heading_level,
  69. current_heading_level,
  70. heading_text,
  71. base_dir,
  72. last_heading_dir,
  73. current_heading_dir,
  74. )
  75. )
  76. if heading_level == 1:
  77. heading_dir = os.path.join(base_dir, heading_text)
  78. current_heading_dir = heading_dir
  79. last_heading_dir = base_dir
  80. elif heading_level == current_heading_level:
  81. heading_dir = os.path.join(
  82. os.path.dirname(current_heading_dir), heading_text
  83. )
  84. last_heading_dir = heading_dir
  85. current_heading_dir = heading_dir
  86. elif heading_level >= current_heading_level:
  87. heading_dir = os.path.join(current_heading_dir, heading_text)
  88. last_heading_dir = current_heading_dir
  89. current_heading_dir = heading_dir
  90. else:
  91. print("NOT SUPPORTED YET")
  92. if not os.path.exists(heading_dir):
  93. os.makedirs(heading_dir)
  94. # add _index.md for page organization
  95. with open(heading_dir + "/_index.md", "wb") as f:
  96. print("Created", heading_dir + "/_index.md")
  97. index = """
  98. +++
  99. title = "{heading_text}"
  100. chapter = true
  101. weight = 5
  102. +++
  103. # {heading_text}
  104. Discover what this Hugo theme is all about and the core-concepts behind it.
  105. """.format(
  106. heading_text=heading_text
  107. )
  108. f.write(b"%s" % index.encode("utf-8"))
  109. # Set the current heading level and directory
  110. current_heading_level = heading_level
  111. # last_heading_dir = current_heading_dir
  112. # current_heading_dir = heading_dir
  113. elif element.name == "hr":
  114. current_heading_level = 0
  115. current_heading_dir = base_dir
  116. elif element.name == "ul" and current_heading_level != 0:
  117. # Get the links in the list
  118. links = element.find_all("a")
  119. for link in links:
  120. # Get the text and href of the link
  121. link_text = link.text.strip()
  122. link_url = link["href"]
  123. file_path = os.path.join(
  124. current_heading_dir, os.path.basename(link_text)
  125. )
  126. if link_url:
  127. with open(file_path + ".md", "wb") as f:
  128. doc_link = link_url + "/download"
  129. response = requests.get(doc_link)
  130. hugo_header = '---\ntitle: "' + link_text + '"\n---\n\n'
  131. content = parse_for_notice(
  132. response.content.decode("utf-8")
  133. ).encode("utf-8")
  134. content = parse_for_toc(content.decode("utf-8")).encode("utf-8")
  135. f.write(hugo_header.encode("utf-8"))
  136. f.write(content.replace(b"---", b""))
  137. print("Downloaded", doc_link, "to", file_path)
  138. headings = parse_markdown_file("content/my-first-post.md", base_dir="./content")
  139. print(headings)