You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

67 lines
2.2 KiB

  1. import os
  2. import requests
  3. from bs4 import BeautifulSoup
  4. import markdown
  5. def parse_markdown_file(markdown_file_path, base_dir="./"):
  6. with open(markdown_file_path, "r") as f:
  7. markdown_text = f.read()
  8. html = markdown.markdown(markdown_text, extensions=["fenced_code"])
  9. soup = BeautifulSoup(html, "html.parser")
  10. if not os.path.exists(base_dir):
  11. os.makedirs(base_dir)
  12. current_heading_level = 1
  13. current_heading_dir = base_dir
  14. last_heading_dir = base_dir
  15. for element in soup.children:
  16. if element.name in ["h1", "h2", "h3"]:
  17. # Get the text of the heading and the heading level
  18. heading_text = element.text.strip()
  19. heading_level = int(element.name[1])
  20. # Determine the directory to create for the heading
  21. if heading_level == current_heading_level:
  22. heading_dir = os.path.join(last_heading_dir, heading_text)
  23. else:
  24. heading_dir = os.path.join(current_heading_dir, heading_text)
  25. if not os.path.exists(heading_dir):
  26. os.makedirs(heading_dir)
  27. # Set the current heading level and directory
  28. current_heading_level = heading_level
  29. last_heading_dir = current_heading_dir
  30. current_heading_dir = heading_dir
  31. elif element.name == "hr":
  32. current_heading_level = 0
  33. current_heading_dir = base_dir
  34. elif element.name == "ul" and current_heading_level != 0:
  35. # Get the links in the list
  36. links = element.find_all("a")
  37. for link in links:
  38. # Get the text and href of the link
  39. link_text = link.text.strip()
  40. link_url = link["href"]
  41. file_path = os.path.join(
  42. current_heading_dir, os.path.basename(link_text)
  43. )
  44. if link_url:
  45. with open(file_path, "wb") as f:
  46. response = requests.get(link_url[:-1] + "/download")
  47. f.write(response.content)
  48. print("Downloaded", link_url, "to", file_path)
  49. headings = parse_markdown_file(
  50. "content/posts/my-first-post.md", base_dir="./content/posts"
  51. )
  52. print(headings)