# Source repository: zhangtao/smqjh-refuel
import re
import os
import argparse

def _safe_stem(stem, fallback):
    """Return *stem* usable as a single path component, or *fallback* if empty."""
    # A path separator inside the name would make os.path.join point into a
    # subdirectory that was never created, so the later open() would fail.
    return re.sub(r'[\\/]', '-', stem) or fallback


def split_llms(input_file, output_dir, remove_input=False):
    """Split a combined llms-full.txt style file into one markdown file per section.

    The input is cut on lines consisting solely of ``---``. A chunk holding a
    line of the form ``url: '...'`` is treated as frontmatter for the next
    content chunk; that content is written to ``output_dir`` under the last
    path segment of the URL (``.md`` is appended when missing) with the
    frontmatter re-attached. Content with no pending URL is named after the
    lower-cased first word of its first ``# `` heading; content with neither
    a pending URL nor such a heading is skipped.

    NOTE: a literal ``---`` line inside a page body also splits that page;
    the trailing part is then handled as content without frontmatter.

    Args:
        input_file: Path of the combined text file to read (UTF-8).
        output_dir: Directory that receives the generated ``.md`` files;
            created if it does not exist.
        remove_input: When true, delete ``input_file`` after processing.

    Returns:
        None. Progress and errors are reported with ``print``; a missing
        input file is reported and nothing is written.
    """
    print(f"Reading from: {input_file}")
    print(f"Outputting to: {output_dir}")

    if not os.path.exists(input_file):
        print(f"Error: Input file {input_file} not found.")
        return

    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # URL taken from the most recent frontmatter chunk, waiting for its content.
    pending_url = None

    # Splitting on "\n---\n" tears each frontmatter block away from its body,
    # so the loop below pairs every URL chunk with the content that follows it.
    for raw_chunk in re.split(r'\n---\n', content):
        chunk = raw_chunk.strip()
        if not chunk:
            continue

        url_match = re.search(r"^url: '(.*?)'", chunk, re.MULTILINE)
        if url_match:
            pending_url = url_match.group(1)
            continue

        if pending_url is not None:
            # Strip trailing slashes first: otherwise a URL such as
            # ".../guide/" has an empty last segment and every such page
            # would be written to the same ".md" file.
            stem = _safe_stem(pending_url.rstrip('/').split('/')[-1], 'index')
            filename = stem if stem.endswith('.md') else stem + '.md'
            file_content = f"---\nurl: '{pending_url}'\n---\n\n{chunk}"
            # The URL belongs to this chunk only.
            pending_url = None
        else:
            # Fallback for content without preceding metadata.
            title_line = next(
                (line for line in chunk.split('\n') if line.startswith('# ')),
                None,
            )
            if title_line is None:
                # Skip chunks that are likely just navigation or noise.
                continue
            # Drop only the leading "# " marker; str.replace would also
            # delete any later "# " inside the heading text.
            words = title_line[2:].split()
            stem = _safe_stem(words[0].lower() if words else '', 'untitled')
            filename = f"{stem}.md"
            file_content = chunk

        filepath = os.path.join(output_dir, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(file_content)
        print(f"Created {filepath}")

    if remove_input:
        try:
            os.remove(input_file)
            print(f"Successfully removed input file: {input_file}")
        except OSError as e:
            print(f"Error removing input file: {e}")

if __name__ == "__main__":
    # Command-line entry point: read the two required paths and the optional
    # cleanup flag, then hand them to split_llms.
    cli = argparse.ArgumentParser(
        description='Split llms-full.txt into component markdown files.',
    )
    cli.add_argument(
        '--input',
        required=True,
        type=str,
        help='Path to llms-full.txt',
    )
    cli.add_argument(
        '--output',
        required=True,
        type=str,
        help='Output directory for references',
    )
    cli.add_argument(
        '--remove-input',
        action='store_true',
        help='Remove the input file after processing',
    )

    options = cli.parse_args()

    split_llms(
        input_file=options.input,
        output_dir=options.output,
        remove_input=options.remove_input,
    )