From f8a7e90fc209a9be18b9b66f2e04a623296d51ab Mon Sep 17 00:00:00 2001 From: Shane Date: Mon, 3 Apr 2023 17:03:41 +1000 Subject: [PATCH] - Add token limit flag and file limit, so output can be split by tokens - Update docs to support - Add requirements file. This gives a solution for #35, #37. Perhaps #26 --- .gitignore | 3 +- README.md | 39 +++++++++++++ gpt_repository_loader.py | 123 +++++++++++++++++++++++++++++++++------ requirements.txt | 1 + 4 files changed, 148 insertions(+), 18 deletions(-) create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index fe4f192..372752d 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,5 @@ venv/ *.swp *.swo *.swn -*~ \ No newline at end of file +*~ +.idea/ \ No newline at end of file diff --git a/README.md b/README.md index 782758a..786ac27 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,14 @@ ## Contributing Some context around building this is [located here](https://github.com/mpoon/gpt-repository-loader/discussions/18). Appreciate any issues and pull requests in the spirit of having mostly GPT build out this tool. Using [ChatGPT Plus](https://chat.openai.com/) is recommended for quick access to GPT-4. +## Requirements + +You'll need the following Python dependency. + +``` +pip install tiktoken +``` + ## Getting Started To get started with `gpt-repository-loader`, follow these steps: @@ -21,6 +29,37 @@ To get started with `gpt-repository-loader`, follow these steps: 5. The tool will generate an output.txt file containing the text representation of the repository. You can now use this file as input for AI language models or other text-based processing tasks. +## Token Limit + +You can use the token limit parameter to limit the number of output tokens per file. This is useful if you need to split the +output into multiple files, so that each request fits within GPT-4's token limits. Your API may have a 4k, 8k or +a 32k token limit.
+ + ```bash + python gpt_repository_loader.py /path/to/git/repository [-p /path/to/preamble.txt] [-o /path/to/output_file.txt] + [-t 8000] + ``` + +The above command will output multiple files, starting with an index of 1. Each file will have around 8k tokens. For +example, + +``` +/path/to/output_file_1.txt +/path/to/output_file_2.txt +/path/to/output_file_3.txt +/path/to/output_file_4.txt +/path/to/output_file_5.txt +``` + +By default, the max output is limited to 5 files. Any content that doesn't fit the limit is ignored. You can override +the max output limit by adding the `-m` argument. +The following example splits the output into up to 20 files, with each file limited to about 32k tokens. + + ```bash + python gpt_repository_loader.py /path/to/git/repository [-p /path/to/preamble.txt] [-o /path/to/output_file.txt] + [-t 32000] [-m 20] + ``` + ## Running Tests To run the tests for `gpt-repository-loader`, follow these steps: diff --git a/gpt_repository_loader.py b/gpt_repository_loader.py index 57279f5..0d480f0 100755 --- a/gpt_repository_loader.py +++ b/gpt_repository_loader.py @@ -3,6 +3,8 @@ import os import sys import fnmatch +import tiktoken + def get_ignore_list(ignore_file_path): ignore_list = [] @@ -13,13 +15,44 @@ def get_ignore_list(ignore_file_path): ignore_list.append(line.strip()) return ignore_list + def should_ignore(file_path, ignore_list): for pattern in ignore_list: if fnmatch.fnmatch(file_path, pattern): return True return False -def process_repository(repo_path, ignore_list, output_file): + +def get_token_count(string, model_name='gpt-4'): + encoding = tiktoken.encoding_for_model(model_name) + return len(encoding.encode(string)) + + +def write_preamble(file, preamble_file=None): + # Get preamble text + if preamble_file: + with open(preamble_file, 'r') as pf: + preamble_text = pf.read() + else: + preamble_text = "The following text is a Git repository with code.
The structure of the text are sections that begin with ----, followed by a single line containing the file path and file name, followed by a variable amount of lines containing the file contents. The text representing the Git repository ends when the symbols --END-- are encounted. Any further text beyond --END-- are meant to be interpreted as instructions using the aforementioned Git repository as context.\n" + + file.write(f"{preamble_text}\n") + + +def close_output_file(file): + file.write("--END--") + file.close() + + +def process_repository(repo_path, ignore_list, output_file_path, tokens_per_file=-1, preamble_path=None, + max_output_files=5): + # Initialize output file index + output_file_index = 1 + + current_output_file = None + written_token_count = 0 + token_count = 0 + for root, _, files in os.walk(repo_path): for file in files: file_path = os.path.join(root, file) @@ -28,13 +61,68 @@ def process_repository(repo_path, ignore_list, output_file): if not should_ignore(relative_file_path, ignore_list): with open(file_path, 'r', errors='ignore') as file: contents = file.read() - output_file.write("-" * 4 + "\n") - output_file.write(f"{relative_file_path}\n") - output_file.write(f"{contents}\n") + + if tokens_per_file < 0: + # no token limit - write to a single file + output_file = open(f"{output_file_path}", "w") + output_file.write("-" * 4 + "\n") + output_file.write(f"{relative_file_path}\n") + output_file.write(f"{contents}\n") + else: + # use the given token limit, write to multiple files + token_count = get_token_count(contents, 'gpt-4') + output_file_base, output_file_extension = os.path.splitext(output_file_path) + output_path_with_index = f"{output_file_base}_{output_file_index}{output_file_extension}" + + # if there's no file, create a new one + if not current_output_file: + print(f"Writing to file {output_path_with_index}") + current_output_file = open(output_path_with_index, "w") + write_preamble(current_output_file, preamble_path) + + # if the new 
token count after written exceeds the limit, close the file and start a new one. + if (written_token_count + token_count) > tokens_per_file: + # Close the current output file if it exists and update the output file index + if current_output_file: + close_output_file(current_output_file) + current_output_file = None + + # Show the token count used + print(f"Written " + str(written_token_count) + " tokens to file.") + + # Create a new output file + output_file_index += 1 + + # If the max file limit reached, skip + if output_file_index > max_output_files: + print("Max file limit reached. Quitting early.") + return output_file_index - 1 + + output_path_with_index = f"{output_file_base}_{output_file_index}{output_file_extension}" + output_file_base, output_file_extension = os.path.splitext(output_file_path) + current_output_file = open(f"{output_file_base}_{output_file_index}" + f"{output_file_extension}", "w") + + print(f"Writing to file {output_path_with_index}") + write_preamble(current_output_file, preamble_path) + written_token_count = 0 + + current_output_file.write("-" * 4 + "\n") + current_output_file.write(f"{relative_file_path}\n") + current_output_file.write(f"{contents}\n") + written_token_count += token_count + + # after iterating through all files, if there's an open file, close it + if current_output_file: + close_output_file(current_output_file) + + return output_file_index + if __name__ == "__main__": if len(sys.argv) < 2: - print("Usage: python git_to_text.py /path/to/git/repository [-p /path/to/preamble.txt] [-o /path/to/output_file.txt]") + print( + "Usage: python git_to_text.py /path/to/git/repository [-p /path/to/preamble.txt] [-o /path/to/output_file.txt]") sys.exit(1) repo_path = sys.argv[1] @@ -55,20 +143,21 @@ def process_repository(repo_path, ignore_list, output_file): if "-o" in sys.argv: output_file_path = sys.argv[sys.argv.index("-o") + 1] + tokens_per_file = -1 + if "-t" in sys.argv: + tokens_per_file = int(sys.argv[sys.argv.index("-t") + 
1]) + + max_output_files = 5 + if "-m" in sys.argv: + max_output_files = int(sys.argv[sys.argv.index("-m") + 1]) + if os.path.exists(ignore_file_path): ignore_list = get_ignore_list(ignore_file_path) else: ignore_list = [] - with open(output_file_path, 'w') as output_file: - if preamble_file: - with open(preamble_file, 'r') as pf: - preamble_text = pf.read() - output_file.write(f"{preamble_text}\n") - else: - output_file.write("The following text is a Git repository with code. The structure of the text are sections that begin with ----, followed by a single line containing the file path and file name, followed by a variable amount of lines containing the file contents. The text representing the Git repository ends when the symbols --END-- are encounted. Any further text beyond --END-- are meant to be interpreted as instructions using the aforementioned Git repository as context.\n") - process_repository(repo_path, ignore_list, output_file) - with open(output_file_path, 'a') as output_file: - output_file.write("--END--") - print(f"Repository contents written to {output_file_path}.") - \ No newline at end of file + output_file_index = process_repository(repo_path, ignore_list, output_file_path, tokens_per_file, preamble_file, + max_output_files) + + # Display final message + print(f"Repository contents written to {output_file_index} file(s).") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..152fbd5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +tiktoken==0.3.3 \ No newline at end of file