Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
test.py
test.ipynb
/dist
/build
**/__pycache__
4 changes: 4 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"python.analysis.typeCheckingMode": "basic",
"python.analysis.autoImportCompletions": true
}
28 changes: 19 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
# repo2prompt
Turn a Github Repo's contents into a big prompt for long-context models like Claude 3 Opus.

<a target="_blank" href="https://colab.research.google.com/github/andrewgcodes/repo2prompt/blob/main/repo2prompt.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
This is a simple package with minimal dependencies that turns a Github Repo's contents into a big prompt for long-context models.

Super easy:
You will need a public GitHub repo URL and a GitHub access token. You can also use this with private repos, but your token will need the corresponding permissions.
This works for repos containing Rust, Python, and JavaScript code via the following file types:
`.py`, `.ipynb`, `.html`, `.css`, `.js`, `.jsx`, `.rst`, `.md`, `.rs`

Within the build_directory_tree function, you can specify which file extensions should be included in the output.
Example Usage:

The output is saved to a .txt file with name [repo]-formatted-prompt.txt
```python
from repo2prompt.extraction import extract_repo

extract_repo(github_url="https://github.com/vllm-project/vllm/tree/main", github_token="your_github_token")
```

Or

```python
from repo2prompt.extraction import extract_repo

extract_repo(github_url="https://github.com/vllm-project/vllm/tree/main") # os.getenv("GITHUB_TOKEN") used internally
```

Note: GitHub limits API usage to 5,000 requests per hour, so be careful — if something fails on a large repository or after repeated runs, hitting the rate limit may be why!
2 changes: 2 additions & 0 deletions build/lib/repo2prompt/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .extraction import extract_repo

152 changes: 152 additions & 0 deletions build/lib/repo2prompt/extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import os
import base64
from urllib.parse import urlparse
from typing import Optional
from tqdm import tqdm
from typing import List, Dict, Any
import asyncio
import aiohttp
from .types import RateLimitExceeded


def parse_github_url(url):
    """Extract the repository owner and name from a GitHub URL.

    Args:
        url: Any GitHub repository URL, e.g. "https://github.com/owner/repo/tree/main".

    Returns:
        A (owner, repo) tuple taken from the first two path segments.

    Raises:
        ValueError: if the URL path does not contain at least owner/repo.
    """
    segments = urlparse(url).path.strip("/").split("/")
    if len(segments) < 2:
        raise ValueError("Invalid GitHub URL provided!")
    return segments[0], segments[1]

async def fetch_repo_content(owner, repo, path='', token=None):
    """Fetch file/directory metadata from the GitHub contents API.

    Args:
        owner: Repository owner (user or organization).
        repo: Repository name.
        path: Path within the repository ('' = repo root).
        token: Optional GitHub token; sent as a Bearer header when truthy.

    Returns:
        The decoded JSON payload on HTTP 200.

    Raises:
        RateLimitExceeded: on HTTP 403 (GitHub rate limiting).
        Exception: for any other non-200 status.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"Bearer {token}"
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as response:
            # 403 is how GitHub signals rate limiting on this endpoint.
            if response.status == 403:
                raise RateLimitExceeded
            if response.status != 200:
                raise Exception(f"Error fetching content: {response.status}")
            return await response.json()

def get_file_content(file_info):
    """Return the decoded text of a file payload from the GitHub API.

    GitHub returns file blobs base64-encoded; anything else is passed
    through unchanged.
    """
    content = file_info['content']
    if file_info['encoding'] != 'base64':
        return content
    return base64.b64decode(content).decode('utf-8')



async def build_directory_tree(
    owner: str,
    repo: str,
    path: str = '',
    token: Optional[str] = None,
    indent: int = 0,
    file_paths: Optional[List[tuple[int, str]]] = None,
    is_base: bool = False
) -> tuple[str, List[tuple[int, str]]]:
    """Recursively render the repository as an indented text tree.

    Args:
        owner: Repository owner (user or organization).
        repo: Repository name.
        path: Path within the repository to start from ('' = repo root).
        token: Optional GitHub token forwarded to the API calls.
        indent: Current nesting depth (one space per level in the output).
        file_paths: Optional list to append results to. Defaults to a fresh
            list (the previous mutable default `[]` leaked entries across calls).
        is_base: When True, show a tqdm progress bar for the top-level items.

    Returns:
        (tree_str, file_paths): the rendered tree text and a list of
        (indent, path) tuples for files whose extension should be
        included in the prompt.
    """

    async def process_item(item: Dict[str, Any]) -> tuple[str, List[tuple[int, str]]]:
        # Skip everything under .github/ (the original `pass` was a no-op
        # and the items were processed anyway).
        if '.github' in item['path'].split('/'):
            return "", []
        if item['type'] == 'dir':
            header = ' ' * indent + f"[{item['name']}/]\n"
            # Recurse with a fresh accumulator; results are merged below so
            # no list is shared between sibling tasks.
            sub_tree, sub_paths = await build_directory_tree(
                owner, repo, item['path'], token, indent + 1, is_base=False
            )
            return header + sub_tree, sub_paths
        line = ' ' * indent + f"{item['name']}\n"
        # Indicate which file extensions should be included in the prompt!
        if item['name'].endswith(('.py', '.ipynb', '.html', '.css', '.js', '.jsx', '.rst', '.md', '.rs')):
            return line, [(indent, item['path'])]
        return line, []

    collected: List[tuple[int, str]] = file_paths if file_paths is not None else []
    items = await fetch_repo_content(owner, repo, path, token)
    if items is None:
        return "", collected

    tasks = [process_item(item) for item in items]
    if is_base:
        # Progress bar for the top level only. asyncio.gather preserves the
        # API's item order (as_completed interleaved the tree randomly).
        bar = tqdm(total=len(tasks), desc="Building tree")

        async def tracked(coro):
            result = await coro
            bar.update(1)
            return result

        results = await asyncio.gather(*(tracked(t) for t in tasks))
        bar.close()
    else:
        results = await asyncio.gather(*tasks)

    tree_str = ''.join(sub_tree for sub_tree, _ in results)
    for _, sub_paths in results:
        collected.extend(sub_paths)
    return tree_str, collected




async def fetch_file_content(args, semaphore) -> str:
    """Fetch a single file and format it as an indented, labelled section.

    Args:
        args: (owner, repo, path, token, indent) tuple for one file.
        semaphore: Caps the number of concurrent GitHub requests.

    Returns:
        The file content wrapped with its path header and indentation.
    """
    owner, repo, path, token, indent = args
    pad = ' ' * indent
    async with semaphore:
        file_info = await fetch_repo_content(owner, repo, path, token)
        file_content = get_file_content(file_info)
    return f"\n{pad}{path}:\n{pad}\n{file_content}\n{pad}\n"

async def fetch_file_contents(owner, repo, file_paths, github_token) -> str:
    """Concurrently download every listed file and concatenate the sections.

    asyncio.gather keeps results in the same order as `file_paths`; the
    semaphore limits GitHub requests to 100 in flight at once.
    """
    semaphore = asyncio.Semaphore(100)  # Limit the number of concurrent file fetches
    tasks = []
    for indent, path in file_paths:
        tasks.append(
            fetch_file_content((owner, repo, path, github_token, indent), semaphore)
        )
    sections = await asyncio.gather(*tasks)
    return ''.join(sections)

async def extract_repo(
    github_url: str,
    github_token: Optional[str] = None,
) -> tuple[str, str]:
    '''
    Turn a GitHub repository's contents into a single prompt-ready string.

    Args:
        github_url : str : A URL to a Github repository; must contain a
            'tree' path segment (e.g. .../tree/main or .../tree/branch_name)
        github_token : Optional[str] : A Github personal access token, if not
            provided will use the GITHUB_TOKEN env variable
    Returns:
        tuple[str, str] : (formatted_string, directory_tree) where
            formatted_string contains the README plus every included file's
            contents, and directory_tree is the rendered tree text
    Raises:
        ValueError : if the URL has no 'tree' segment or no owner/repo
    '''
    import time  # local import: only needed for the timing diagnostics below

    if github_token is None:
        github_token = os.getenv("GITHUB_TOKEN")
    # Accept any URL containing a 'tree' segment. The previous check
    # (split('/')[-2] != 'tree') wrongly rejected URLs ending in 'tree'
    # or carrying a trailing slash, contradicting its own error message.
    if 'tree' not in github_url.split('/'):
        raise ValueError(
            "Please provide a URL that ends with 'tree', 'tree/main', or 'tree/branch_name'. "
            f"Got URL: {github_url}"
        )
    owner, repo = parse_github_url(github_url)

    readme_info = await fetch_repo_content(owner, repo, 'README.md', github_token)
    readme_content = get_file_content(readme_info)
    formatted_string = f"README.md:\n\n{readme_content}\n\n\n"

    t0 = time.perf_counter()
    directory_tree, file_paths = await build_directory_tree(owner, repo, token=github_token, is_base=True)
    print(f"Time in build_directory_tree: {time.perf_counter() - t0:.2f} seconds")

    t0 = time.perf_counter()
    formatted_string += await fetch_file_contents(owner, repo, file_paths, github_token)
    print(f"Time in fetch_file_contents: {time.perf_counter() - t0:.2f} seconds")
    return formatted_string, directory_tree
168 changes: 0 additions & 168 deletions repo2prompt.ipynb

This file was deleted.

21 changes: 21 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[metadata]
name = repo2prompt
version = 0.1.0
author = Your Name
author_email = your.email@example.com
description = Turn a GitHub repo's contents into a big prompt for long-context models
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/yourusername/my_package
classifiers =
Programming Language :: Python :: 3
License :: OSI Approved :: MIT License
Operating System :: OS Independent
[options]
package_dir =
= src
packages = find:
python_requires = >=3.9

[options.packages.find]
where = src
Loading