Skip to content

Commit

Permalink
- Add token limit flag and file limit, so output can be split by tokens
Browse files Browse the repository at this point in the history
- Update docs to support
- Add requirements file.

This gives a solution for mpoon#35, mpoon#37. Perhaps mpoon#26
  • Loading branch information
shanecp committed Apr 3, 2023
1 parent c243824 commit f8a7e90
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 18 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,5 @@ venv/
*.swp
*.swo
*.swn
*~
*~
.idea/
39 changes: 39 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
## Contributing
Some context around building this is [located here](https://github.com/mpoon/gpt-repository-loader/discussions/18). Appreciate any issues and pull requests in the spirit of having mostly GPT build out this tool. Using [ChatGPT Plus](https://chat.openai.com/) is recommended for quick access to GPT-4.

## Requirements

You'll need the following Python dependency:

```
pip install tiktoken
```

## Getting Started

To get started with `gpt-repository-loader`, follow these steps:
Expand All @@ -21,6 +29,37 @@ To get started with `gpt-repository-loader`, follow these steps:

5. The tool will generate an output.txt file containing the text representation of the repository. You can now use this file as input for AI language models or other text-based processing tasks.

## Token Limit

Use the token limit parameter (`-t`) to cap the number of tokens written to each output file. This is useful when you
need to split the output across multiple files so that each one fits within the model's request limit. Depending on
the model, your API may have a 4k, 8k, or 32k token limit.
```bash
python gpt_repository_loader.py /path/to/git/repository [-p /path/to/preamble.txt] [-o /path/to/output_file.txt]
[-t 8000]
```
The above command writes multiple output files, numbered starting from 1. Each file will contain around 8k tokens.
For example:
```
/path/to/output_file_1.txt
/path/to/output_file_2.txt
/path/to/output_file_3.txt
/path/to/output_file_4.txt
/path/to/output_file_5.txt
```
By default, at most 5 output files are written. Any content that doesn't fit within that limit is ignored. You can
override the maximum number of output files with the `-m` argument.
The following example splits the output into up to 20 files, each holding around 32k tokens.

```bash
python gpt_repository_loader.py /path/to/git/repository [-p /path/to/preamble.txt] [-o /path/to/output_file.txt]
[-t 32000] [-m 20]
```

## Running Tests

To run the tests for `gpt-repository-loader`, follow these steps:
Expand Down
123 changes: 106 additions & 17 deletions gpt_repository_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import os
import sys
import fnmatch
import tiktoken


def get_ignore_list(ignore_file_path):
ignore_list = []
Expand All @@ -13,13 +15,44 @@ def get_ignore_list(ignore_file_path):
ignore_list.append(line.strip())
return ignore_list


def should_ignore(file_path, ignore_list):
    """Tell whether ``file_path`` matches at least one ignore pattern.

    Args:
        file_path: Path string to test.
        ignore_list: Iterable of glob-style patterns understood by
            ``fnmatch.fnmatch``.

    Returns:
        True if any pattern matches ``file_path``, otherwise False
        (including when ``ignore_list`` is empty).
    """
    return any(fnmatch.fnmatch(file_path, glob) for glob in ignore_list)

def process_repository(repo_path, ignore_list, output_file):

def get_token_count(string, model_name='gpt-4'):
    """Return how many tokens ``string`` encodes to for ``model_name``.

    Args:
        string: Text to tokenize.
        model_name: Model whose tiktoken encoding is used for counting.

    Returns:
        The number of tokens produced by encoding ``string``.
    """
    encoding = tiktoken.encoding_for_model(model_name)
    # Repository files can legitimately contain special-token text such as
    # "<|endoftext|>". tiktoken raises ValueError on that text by default;
    # disabling the check encodes it as ordinary text so that counting
    # tokens never aborts the whole run.
    return len(encoding.encode(string, disallowed_special=()))


def write_preamble(file, preamble_file=None):
    """Write the preamble text plus a trailing newline to ``file``.

    Args:
        file: Writable text file object that receives the preamble.
        preamble_file: Optional path of a file whose contents replace the
            built-in preamble. A falsy value selects the built-in text.
    """
    # Start from the built-in text and override it only when a path is given.
    text = "The following text is a Git repository with code. The structure of the text are sections that begin with ----, followed by a single line containing the file path and file name, followed by a variable amount of lines containing the file contents. The text representing the Git repository ends when the symbols --END-- are encounted. Any further text beyond --END-- are meant to be interpreted as instructions using the aforementioned Git repository as context.\n"
    if preamble_file:
        with open(preamble_file, 'r') as source:
            text = source.read()

    file.write(text + "\n")


def close_output_file(file):
    """Write the ``--END--`` terminator to ``file`` and close it.

    Args:
        file: Open, writable text file object. It is closed on return,
            even when writing the terminator fails.

    Raises:
        Whatever ``file.write`` raises; the error propagates after the
        file has been closed.
    """
    try:
        file.write("--END--")
    finally:
        # Always release the handle so a failed write cannot leak it.
        file.close()


# Walk repo_path and, for every file whose relative path is not matched by
# ignore_list, emit a "----" separator line, the relative path and the file
# contents.
#
# Parameters:
#   repo_path        - directory passed to os.walk.
#   ignore_list      - patterns handed to should_ignore.
#   output_file_path - output path; when a token limit is used, the index is
#                      inserted before the extension (<base>_<index><ext>).
#   tokens_per_file  - negative means "no token limit"; otherwise the token
#                      budget per output file, counted with get_token_count.
#   preamble_path    - forwarded to write_preamble for each split file.
#   max_output_files - highest file index that may be created when splitting.
#
# Returns the current output file index, or output_file_index - 1 when the
# max_output_files limit stops processing early.
#
# NOTE(review): indentation is not preserved in this view, and the diff view
# collapses some lines inside the loop (see the "Expand All" marker below), so
# the assignment of relative_file_path is not visible here.
def process_repository(repo_path, ignore_list, output_file_path, tokens_per_file=-1, preamble_path=None,
max_output_files=5):
# Initialize output file index
output_file_index = 1

# Handle of the split file currently being written (None until one is opened)
current_output_file = None
# Tokens of file contents written to the current split file so far
written_token_count = 0
token_count = 0

for root, _, files in os.walk(repo_path):
for file in files:
file_path = os.path.join(root, file)
Expand All @@ -28,13 +61,68 @@ def process_repository(repo_path, ignore_list, output_file):
if not should_ignore(relative_file_path, ignore_list):
# errors='ignore' drops undecodable bytes instead of raising
with open(file_path, 'r', errors='ignore') as file:
contents = file.read()
# NOTE(review): the next three lines write to `output_file`, which is not a
# parameter of the signature above - presumably pre-change lines shown by the
# diff view; confirm they are absent from the final file.
output_file.write("-" * 4 + "\n")
output_file.write(f"{relative_file_path}\n")
output_file.write(f"{contents}\n")

if tokens_per_file < 0:
# no token limit - write to a single file
# NOTE(review): this open() uses mode "w" and appears to run once per
# repository file, which would truncate what earlier iterations wrote; the
# handle is never closed here, and neither write_preamble nor
# close_output_file is called on this path - confirm against the real file.
output_file = open(f"{output_file_path}", "w")
output_file.write("-" * 4 + "\n")
output_file.write(f"{relative_file_path}\n")
output_file.write(f"{contents}\n")
else:
# use the given token limit, write to multiple files
# Only the file contents are counted; the separator, path line and preamble
# are not included in token_count.
token_count = get_token_count(contents, 'gpt-4')
output_file_base, output_file_extension = os.path.splitext(output_file_path)
output_path_with_index = f"{output_file_base}_{output_file_index}{output_file_extension}"

# if there's no file, create a new one
if not current_output_file:
print(f"Writing to file {output_path_with_index}")
current_output_file = open(output_path_with_index, "w")
write_preamble(current_output_file, preamble_path)

# if the new token count after written exceeds the limit, close the file and start a new one.
# NOTE(review): when a single file's token_count alone exceeds tokens_per_file
# and the current split file was just opened, this closes a file holding only
# the preamble and "--END--" before moving on - confirm this is intended.
if (written_token_count + token_count) > tokens_per_file:
# Close the current output file if it exists and update the output file index
if current_output_file:
close_output_file(current_output_file)
current_output_file = None

# Show the token count used
print(f"Written " + str(written_token_count) + " tokens to file.")

# Create a new output file
output_file_index += 1

# If the max file limit reached, skip
if output_file_index > max_output_files:
print("Max file limit reached. Quitting early.")
return output_file_index - 1

# output_file_base/output_file_extension still hold the values computed above,
# so recomputing them on the following line does not change the path.
output_path_with_index = f"{output_file_base}_{output_file_index}{output_file_extension}"
output_file_base, output_file_extension = os.path.splitext(output_file_path)
current_output_file = open(f"{output_file_base}_{output_file_index}"
f"{output_file_extension}", "w")

print(f"Writing to file {output_path_with_index}")
write_preamble(current_output_file, preamble_path)
written_token_count = 0

current_output_file.write("-" * 4 + "\n")
current_output_file.write(f"{relative_file_path}\n")
current_output_file.write(f"{contents}\n")
written_token_count += token_count

# after iterating through all files, if there's an open file, close it
if current_output_file:
close_output_file(current_output_file)

return output_file_index


# Script entry point: reads the repository path and optional flags directly
# from sys.argv, then calls process_repository and reports how many files
# were written.
# NOTE(review): indentation is not preserved in this view and the diff view
# collapses some lines (see the "Expand All" marker below).
if __name__ == "__main__":
if len(sys.argv) < 2:
# NOTE(review): the usage text does not mention the -t and -m flags parsed
# further down. The single-line print below looks like the pre-change form of
# the multi-line print that follows it - confirm only one remains.
print("Usage: python git_to_text.py /path/to/git/repository [-p /path/to/preamble.txt] [-o /path/to/output_file.txt]")
print(
"Usage: python git_to_text.py /path/to/git/repository [-p /path/to/preamble.txt] [-o /path/to/output_file.txt]")
sys.exit(1)

repo_path = sys.argv[1]
Expand All @@ -55,20 +143,21 @@ def process_repository(repo_path, ignore_list, output_file):
# For each flag, the value is the argument that follows it in sys.argv.
# NOTE(review): sys.argv.index(flag) + 1 raises IndexError when the flag is
# the last argument, and int(...) raises ValueError for non-numeric values.
if "-o" in sys.argv:
output_file_path = sys.argv[sys.argv.index("-o") + 1]

# -t: token budget per output file; -1 means no token limit
tokens_per_file = -1
if "-t" in sys.argv:
tokens_per_file = int(sys.argv[sys.argv.index("-t") + 1])

# -m: maximum number of output files when splitting by tokens
max_output_files = 5
if "-m" in sys.argv:
max_output_files = int(sys.argv[sys.argv.index("-m") + 1])

if os.path.exists(ignore_file_path):
ignore_list = get_ignore_list(ignore_file_path)
else:
ignore_list = []

# NOTE(review): from this `with open(output_file_path, 'w')` through the first
# "Repository contents written to" print, the code passes a file object to
# process_repository and writes the preamble and "--END--" itself - presumably
# pre-change lines shown by the diff view; confirm they are removed.
with open(output_file_path, 'w') as output_file:
if preamble_file:
with open(preamble_file, 'r') as pf:
preamble_text = pf.read()
output_file.write(f"{preamble_text}\n")
else:
output_file.write("The following text is a Git repository with code. The structure of the text are sections that begin with ----, followed by a single line containing the file path and file name, followed by a variable amount of lines containing the file contents. The text representing the Git repository ends when the symbols --END-- are encounted. Any further text beyond --END-- are meant to be interpreted as instructions using the aforementioned Git repository as context.\n")
process_repository(repo_path, ignore_list, output_file)
with open(output_file_path, 'a') as output_file:
output_file.write("--END--")
print(f"Repository contents written to {output_file_path}.")

# process_repository returns the index of the last output file it used
output_file_index = process_repository(repo_path, ignore_list, output_file_path, tokens_per_file, preamble_file,
max_output_files)

# Display final message
print(f"Repository contents written to {output_file_index} file(s).")
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tiktoken==0.3.3

0 comments on commit f8a7e90

Please sign in to comment.