split_llms.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. import re
  2. import os
  3. import argparse
  4. def split_llms(input_file, output_dir, remove_input=False):
  5. print(f"Reading from: {input_file}")
  6. print(f"Outputting to: {output_dir}")
  7. if not os.path.exists(input_file):
  8. print(f"Error: Input file {input_file} not found.")
  9. return
  10. with open(input_file, 'r', encoding='utf-8') as f:
  11. content = f.read()
  12. # Split by "---" lines which seem to separate sections
  13. # However, "---" is also used for frontmatter.
  14. # The structure seems to be:
  15. # ---
  16. # url: '...'
  17. # ---
  18. # # Title
  19. # ... content ...
  20. # We can split by "\n---\n" but this tears apart the frontmatter.
  21. # Let's try to split by the `url` marker which is quite unique in this file context
  22. # Alternatively, keep the sections and merge them.
  23. chunks = re.split(r'\n---\n', content)
  24. if not os.path.exists(output_dir):
  25. os.makedirs(output_dir)
  26. merged_sections = []
  27. current_metadata = {}
  28. for chunk in chunks:
  29. chunk = chunk.strip()
  30. if not chunk:
  31. continue
  32. # Check if this chunk is metadata (contains url:)
  33. url_match = re.search(r"^url: '(.*?)'", chunk, re.MULTILINE)
  34. if url_match:
  35. current_metadata['url'] = url_match.group(1)
  36. continue
  37. # If not metadata, it's content. If we have pending metadata, apply it.
  38. filename = None
  39. if 'url' in current_metadata:
  40. url = current_metadata['url']
  41. filename = url.split('/')[-1]
  42. if not filename.endswith('.md'):
  43. filename += '.md'
  44. # Reset metadata after using it
  45. else:
  46. # Fallback for content without preceding metadata
  47. lines = chunk.split('\n')
  48. title_line = next((line for line in lines if line.startswith('# ')), None)
  49. if title_line:
  50. title = title_line.replace('# ', '').strip().split(' ')[0].lower()
  51. filename = f"{title}.md"
  52. else:
  53. # Skip chunks that are likely just navigation or noise
  54. continue
  55. if filename:
  56. filepath = os.path.join(output_dir, filename)
  57. # Construct file content
  58. file_content = ""
  59. if 'url' in current_metadata:
  60. file_content += "---\n"
  61. file_content += f"url: '{current_metadata['url']}'\n"
  62. file_content += "---\n\n"
  63. # Reset metadata
  64. current_metadata = {}
  65. file_content += chunk
  66. with open(filepath, 'w', encoding='utf-8') as f:
  67. f.write(file_content)
  68. print(f"Created {filepath}")
  69. if remove_input:
  70. try:
  71. os.remove(input_file)
  72. print(f"Successfully removed input file: {input_file}")
  73. except OSError as e:
  74. print(f"Error removing input file: {e}")
  75. if __name__ == "__main__":
  76. parser = argparse.ArgumentParser(description='Split llms-full.txt into component markdown files.')
  77. parser.add_argument('--input', type=str, required=True, help='Path to llms-full.txt')
  78. parser.add_argument('--output', type=str, required=True, help='Output directory for references')
  79. parser.add_argument('--remove-input', action='store_true', help='Remove the input file after processing')
  80. args = parser.parse_args()
  81. split_llms(args.input, args.output, args.remove_input)