diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d7b6454 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +test.py +test.ipynb +/dist +/build +**/__pycache__ diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..dcb1530 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "python.analysis.typeCheckingMode": "basic", + "python.analysis.autoImportCompletions": true +} \ No newline at end of file diff --git a/README.md b/README.md index ea3eda9..258ffb6 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,25 @@ # repo2prompt -Turn a Github Repo's contents into a big prompt for long-context models like Claude 3 Opus. - - Open In Colab - +This is a simple package with minimal dependencies that turns a GitHub repository's contents into a big prompt for long-context models. -Super easy: -You will need a Github repo URL (public) and a Github access token. You can also use this with private repos but your token will need to have those permissions. +It works for repositories containing Rust, Python, and JavaScript code, and includes files with the following extensions: +'.py', '.ipynb', '.html', '.css', '.js', '.jsx', '.rst', '.md', '.rs' -Within the build_directory_tree function, you can specify which file extensions should be included in the output. +Example usage (`extract_repo` is a coroutine, so `await` it or run it with `asyncio.run`): -The output is saved to a .txt file with name [repo]-formatted-prompt.txt +```python +from repo2prompt.extraction import extract_repo + +extract_repo(github_url="https://github.com/vllm-project/vllm/tree/main", github_token="your_github_token") +``` + +Or + +```python +from repo2prompt.extraction import extract_repo + +extract_repo(github_url="https://github.com/vllm-project/vllm/tree/main") # os.getenv("GITHUB_TOKEN") used internally +``` + +Note: GitHub allows only 5,000 API requests per hour, and every file is fetched with its own request, so large repositories can exhaust the limit. -By the way, Github is limited to 5,000 API requests per hour so if a bug happens, that might be why! 
# ----- build/lib/repo2prompt/__init__.py -----
from .extraction import extract_repo

# ----- build/lib/repo2prompt/extraction.py -----
import os
import base64
from urllib.parse import urlparse
from typing import Optional
from tqdm import tqdm
from typing import List, Dict, Any, Tuple
import asyncio
import aiohttp
from .types import RateLimitExceeded
import time

# File extensions whose contents are included in the generated prompt.
_INCLUDED_EXTENSIONS = ('.py', '.ipynb', '.html', '.css', '.js', '.jsx', '.rst', '.md', '.rs')


def parse_github_url(url):
    """
    Parse a GitHub URL and return the repository owner and name.

    Args:
        url: A URL whose first two path segments are the owner and the repository.

    Returns:
        An ``(owner, repo)`` tuple.

    Raises:
        ValueError: If the path does not start with two non-empty segments.
    """
    path_segments = urlparse(url).path.strip("/").split("/")
    if len(path_segments) >= 2 and path_segments[0] and path_segments[1]:
        return path_segments[0], path_segments[1]
    raise ValueError("Invalid GitHub URL provided!")


async def fetch_repo_content(owner, repo, path='', token=None):
    """
    Fetch one path of a repository through the GitHub contents API.

    Args:
        owner: Repository owner.
        repo: Repository name.
        path: Path inside the repository; the empty string addresses the root.
        token: Optional access token, sent as a Bearer token.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RateLimitExceeded: On HTTP 403 or 429.
        Exception: On any other non-200 status.
    """
    base_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"Bearer {token}"
    async with aiohttp.ClientSession() as session:
        async with session.get(base_url, headers=headers) as response:
            if response.status == 200:
                return await response.json()
            # 403 keeps its original meaning; 429 is the other status GitHub
            # uses to signal rate limiting.
            if response.status in (403, 429):
                raise RateLimitExceeded
            raise Exception(f"Error fetching content: {response.status}")


def get_file_content(file_info):
    """
    Return the text of a file entry returned by the contents API.

    Base64 payloads are decoded as UTF-8; undecodable bytes are replaced
    instead of raising, so a single odd file cannot abort a whole extraction.
    Any other encoding is returned as-is.
    """
    if file_info['encoding'] == 'base64':
        return base64.b64decode(file_info['content']).decode('utf-8', errors='replace')
    return file_info['content']


async def build_directory_tree(
    owner: str,
    repo: str,
    path: str = '',
    token: Optional[str] = None,
    indent: int = 0,
    file_paths: Optional[List[Tuple[int, str]]] = None,
    is_base: bool = False
) -> Tuple[str, List[Tuple[int, str]]]:
    """
    Build a text rendering of the directory tree and collect the files to fetch.

    Args:
        owner: Repository owner.
        repo: Repository name.
        path: Directory to list; the empty string lists the repository root.
        token: Optional access token.
        indent: Nesting depth of ``path``; used to indent tree lines and stored
            alongside every collected file path.
        file_paths: Optional list that the collected ``(indent, path)`` pairs
            are appended to. A fresh list is used when omitted.
        is_base: When true, a progress bar is shown for the entries of ``path``.

    Returns:
        ``(tree_str, file_paths)``. Entries appear in the order the API listed
        them; anything under a ``.github`` directory is left out.
    """

    async def process_item(item: Dict[str, Any]) -> Tuple[str, List[Tuple[int, str]]]:
        # Skip the .github folder entirely (tree line and file contents).
        if '.github' in item['path'].split('/'):
            return "", []
        if item['type'] == 'dir':
            header = ' ' * indent + f"[{item['name']}/]\n"
            subtree, sub_paths = await build_directory_tree(
                owner, repo, item['path'], token, indent + 1
            )
            return header + subtree, sub_paths
        line = ' ' * indent + f"{item['name']}\n"
        if item['name'].endswith(_INCLUDED_EXTENSIONS):
            return line, [(indent, item['path'])]
        return line, []

    collected: List[Tuple[int, str]] = [] if file_paths is None else file_paths

    items = await fetch_repo_content(owner, repo, path, token)
    if not items:
        return "", collected
    if isinstance(items, dict):
        # The API returns a single object, not a list, when `path` is a file.
        items = [items]

    # gather() keeps the listing order, so the tree is deterministic, and each
    # task returns only its own paths, so nothing is counted twice.
    if is_base:
        with tqdm(total=len(items), desc="Building tree") as progress:

            async def tracked(item: Dict[str, Any]) -> Tuple[str, List[Tuple[int, str]]]:
                result = await process_item(item)
                progress.update(1)
                return result

            results = await asyncio.gather(*(tracked(item) for item in items))
    else:
        results = await asyncio.gather(*(process_item(item) for item in items))

    tree_str = ''.join(fragment for fragment, _ in results)
    for _, paths in results:
        collected.extend(paths)
    return tree_str, collected


async def fetch_file_content(args, semaphore) -> str:
    """
    Fetch one file and format it as a prompt section.

    Args:
        args: ``(owner, repo, path, token, indent)``.
        semaphore: Limits how many fetches run at the same time.
    """
    owner, repo, path, token, indent = args
    async with semaphore:
        file_info = await fetch_repo_content(owner, repo, path, token)
    file_content = get_file_content(file_info)
    pad = ' ' * indent
    return '\n' + pad + f"{path}:\n" + pad + '\n' + file_content + '\n' + pad + '\n'


async def fetch_file_contents(owner, repo, file_paths, github_token, concurrency: int = 100) -> str:
    """
    Fetch every file in ``file_paths`` and join the formatted sections.

    Args:
        owner: Repository owner.
        repo: Repository name.
        file_paths: ``(indent, path)`` pairs, as returned by ``build_directory_tree``.
        github_token: Optional access token.
        concurrency: Maximum number of simultaneous fetches (at least 1).

    Raises:
        ValueError: If ``concurrency`` is smaller than 1.
    """
    if concurrency < 1:
        # Semaphore(0) would never let a fetch start.
        raise ValueError("concurrency must be at least 1")
    semaphore = asyncio.Semaphore(concurrency)
    tasks = [
        fetch_file_content((owner, repo, path, github_token, indent), semaphore)
        for indent, path in file_paths
    ]
    # asyncio.gather keeps the results in the order of file_paths.
    formatted_contents = await asyncio.gather(*tasks)
    return ''.join(formatted_contents)


async def extract_repo(
    github_url: str,
    github_token: Optional[str] = None,
    max_concurrent_requests: int = 100
) -> Tuple[str, str]:
    '''
    Args:
        github_url : str, A URL to a Github repository of the form .../<owner>/<repo>/tree/<branch>
        github_token : Optional[str], A Github personal access token, if not provided will use the GITHUB_TOKEN env variable
        max_concurrent_requests : int, The number of files that are fetched concurrently
    Returns:
        tuple[str, str] : (formatted_string, directory_tree) - the README followed by the
            contents of every included file, and the text rendering of the directory tree
    Raises:
        ValueError : if the URL does not have the expected form
    '''
    if github_token is None:
        github_token = os.getenv("GITHUB_TOKEN")

    segments = urlparse(github_url).path.strip('/').split('/')
    if len(segments) < 4 or segments[2] != 'tree':
        raise ValueError(
            "Please provide a URL of the form "
            "'https://github.com/<owner>/<repo>/tree/<branch_name>'. "
            f"Got URL: {github_url}"
        )
    # NOTE(review): the branch named in the URL is only validated; no `ref` is
    # sent with the API requests below.
    owner, repo = parse_github_url(github_url)

    readme_info = await fetch_repo_content(owner, repo, 'README.md', github_token)
    readme_content = get_file_content(readme_info)
    formatted_string = f"README.md:\n\n{readme_content}\n\n\n"

    t0 = time.time()
    directory_tree, file_paths = await build_directory_tree(owner, repo, token=github_token, is_base=True)
    print(f"Time in build_directory_tree: {time.time() - t0:.2f} seconds")

    t0 = time.time()
    formatted_string += await fetch_file_contents(
        owner, repo, file_paths, github_token, max_concurrent_requests
    )
    print(f"Time in fetch_file_contents: {time.time() - t0:.2f} seconds")
    return formatted_string, directory_tree
import urlparse\n", - "\n", - "def parse_github_url(url):\n", - " \"\"\"\n", - " Parses your GitHub URL and extracts the repository owner and name.\n", - " \"\"\"\n", - " parsed_url = urlparse(url)\n", - " path_segments = parsed_url.path.strip(\"/\").split(\"/\")\n", - " if len(path_segments) >= 2:\n", - " owner, repo = path_segments[0], path_segments[1]\n", - " return owner, repo\n", - " else:\n", - " raise ValueError(\"Invalid GitHub URL provided!\")\n", - "\n", - "def fetch_repo_content(owner, repo, path='', token=None):\n", - " \"\"\"\n", - " Fetches the content of your GitHub repository.\n", - " \"\"\"\n", - " base_url = f\"https://api.github.com/repos/{owner}/{repo}/contents/{path}\"\n", - " headers = {\"Accept\": \"application/vnd.github.v3+json\"}\n", - " if token:\n", - " headers[\"Authorization\"] = f\"Bearer {token}\"\n", - " response = requests.get(base_url, headers=headers)\n", - " if response.status_code == 200:\n", - " return response.json()\n", - " else:\n", - " response.raise_for_status()\n", - "\n", - "def get_file_content(file_info):\n", - " \"\"\"\n", - " Retrieves and decodes the content of files\n", - " \"\"\"\n", - " if file_info['encoding'] == 'base64':\n", - " return base64.b64decode(file_info['content']).decode('utf-8')\n", - " else:\n", - " return file_info['content']\n", - "\n", - "def build_directory_tree(owner, repo, path='', token=None, indent=0, file_paths=[]):\n", - " \"\"\"\n", - " Builds a string representation of the directory tree and collects file paths.\n", - " \"\"\"\n", - " items = fetch_repo_content(owner, repo, path, token)\n", - " tree_str = \"\"\n", - " for item in items:\n", - " if '.github' in item['path'].split('/'):\n", - " continue\n", - " if item['type'] == 'dir':\n", - " tree_str += ' ' * indent + f\"[{item['name']}/]\\n\"\n", - " tree_str += build_directory_tree(owner, repo, item['path'], token, indent + 1, file_paths)[0]\n", - " else:\n", - " tree_str += ' ' * indent + f\"{item['name']}\\n\"\n", - " # Indicate 
which file extensions should be included in the prompt!\n", - " if item['name'].endswith(('.py', '.ipynb', '.html', '.css', '.js', '.jsx', '.rst', '.md')):\n", - " file_paths.append((indent, item['path']))\n", - " return tree_str, file_paths\n", - "\n", - "def retrieve_github_repo_info(url, token=None):\n", - " \"\"\"\n", - " Retrieves and formats repository information, including README, the directory tree,\n", - " and file contents, while ignoring the .github folder.\n", - " \"\"\"\n", - " owner, repo = parse_github_url(url)\n", - "\n", - " try:\n", - " readme_info = fetch_repo_content(owner, repo, 'README.md', token)\n", - " readme_content = get_file_content(readme_info)\n", - " formatted_string = f\"README.md:\\n```\\n{readme_content}\\n```\\n\\n\"\n", - " except Exception as e:\n", - " formatted_string = \"README.md: Not found or error fetching README\\n\\n\"\n", - "\n", - " directory_tree, file_paths = build_directory_tree(owner, repo, token=token)\n", - "\n", - " formatted_string += f\"Directory Structure:\\n{directory_tree}\\n\"\n", - "\n", - " for indent, path in file_paths:\n", - " file_info = fetch_repo_content(owner, repo, path, token)\n", - " file_content = get_file_content(file_info)\n", - " formatted_string += '\\n' + ' ' * indent + f\"{path}:\\n\" + ' ' * indent + '```\\n' + file_content + '\\n' + ' ' * indent + '```\\n'\n", - "\n", - " return formatted_string" - ] - }, - { - "cell_type": "code", - "source": [ - "# You provide a Github repo URL and a Github personal access token.\n", - "# How to get an access token: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens\n", - "github_url = \"https://github.com/nomic-ai/nomic/tree/main\"\n", - "token = # Github access token (go to Developer Settings to generate one)\n", - "\n", - "owner, repo = parse_github_url(github_url)\n", - "output_file_name = f\"{repo}-formatted-prompt.txt\"\n", - "\n", - "formatted_repo_info = 
retrieve_github_repo_info(github_url, token = token)\n", - "with open(output_file_name, 'w', encoding='utf-8') as file:\n", - " file.write(formatted_repo_info)\n", - "\n", - "print(f\"Repository information has been saved to {output_file_name}\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cuV5LirEa5jI", - "outputId": "e89a5307-03f8-48e4-d721-88bb5c32e55c" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Repository information has been saved to nomic-formatted-prompt.txt\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "rRBY0el6cDg5" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..a05e60f --- /dev/null +++ b/setup.cfg @@ -0,0 +1,21 @@ +[metadata] +name = repo2prompt +version = 0.1.0 +author = Your Name +author_email = your.email@example.com +description = A simple description of my package +long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/yourusername/my_package +classifiers = + Programming Language :: Python :: 3 + License :: OSI Approved :: MIT License + Operating System :: OS Independent +[options] +package_dir = + = src +packages = find: +python_requires = >=3.6 + +[options.packages.find] +where = src \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..3bb7b99 --- /dev/null +++ b/setup.py @@ -0,0 +1,14 @@ +from setuptools import setup, find_packages + +setup( + name='Repo2Prompt', + version='0.1.1', + description='Github repo -> prompt string', + packages=find_packages(where="src"), + package_dir={"": "src"}, + install_requires=[ + 'requests>=2.20.0', + 'tqdm>=4.60.0', + 'aiohttp>=3.9.1', + ], +) \ No newline at end of file diff --git a/src/repo2prompt/__init__.py b/src/repo2prompt/__init__.py new file mode 
# ----- src/repo2prompt/__init__.py -----
from .extraction import extract_repo

# ----- src/repo2prompt/extraction.py -----
import os
import base64
from urllib.parse import urlparse
from typing import Optional
from tqdm import tqdm
from typing import List, Dict, Any, Tuple
import asyncio
import aiohttp
from .types import RateLimitExceeded
import time

# File extensions whose contents are included in the generated prompt.
_INCLUDED_EXTENSIONS = ('.py', '.ipynb', '.html', '.css', '.js', '.jsx', '.rst', '.md', '.rs')


def parse_github_url(url):
    """
    Parse a GitHub URL and return the repository owner and name.

    Args:
        url: A URL whose first two path segments are the owner and the repository.

    Returns:
        An ``(owner, repo)`` tuple.

    Raises:
        ValueError: If the path does not start with two non-empty segments.
    """
    path_segments = urlparse(url).path.strip("/").split("/")
    if len(path_segments) >= 2 and path_segments[0] and path_segments[1]:
        return path_segments[0], path_segments[1]
    raise ValueError("Invalid GitHub URL provided!")


def _rate_limit_reset(headers) -> Optional[int]:
    """Return the ``X-RateLimit-Reset`` header as epoch seconds, or None when absent or malformed."""
    try:
        return int(headers["X-RateLimit-Reset"])
    except (KeyError, ValueError):
        return None


async def fetch_repo_content(owner, repo, path='', token=None):
    """
    Fetch one path of a repository through the GitHub contents API.

    Args:
        owner: Repository owner.
        repo: Repository name.
        path: Path inside the repository; the empty string addresses the root.
        token: Optional access token, sent as a Bearer token.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RateLimitExceeded: On HTTP 403 or 429.
        Exception: On any other non-200 status.
    """
    base_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"Bearer {token}"
    async with aiohttp.ClientSession() as session:
        async with session.get(base_url, headers=headers) as response:
            if response.status == 200:
                return await response.json()
            # 403 keeps its original meaning; 429 is the other status GitHub
            # uses to signal rate limiting.
            if response.status in (403, 429):
                raise RateLimitExceeded(_rate_limit_reset(response.headers))
            raise Exception(f"Error fetching content: {response.status}")


def get_file_content(file_info):
    """
    Return the text of a file entry returned by the contents API.

    Base64 payloads are decoded as UTF-8; undecodable bytes are replaced
    instead of raising, so a single odd file cannot abort a whole extraction.
    Any other encoding is returned as-is.
    """
    if file_info['encoding'] == 'base64':
        return base64.b64decode(file_info['content']).decode('utf-8', errors='replace')
    return file_info['content']


async def build_directory_tree(
    owner: str,
    repo: str,
    path: str = '',
    token: Optional[str] = None,
    indent: int = 0,
    file_paths: Optional[List[Tuple[int, str]]] = None,
    is_base: bool = False
) -> Tuple[str, List[Tuple[int, str]]]:
    """
    Build a text rendering of the directory tree and collect the files to fetch.

    Args:
        owner: Repository owner.
        repo: Repository name.
        path: Directory to list; the empty string lists the repository root.
        token: Optional access token.
        indent: Nesting depth of ``path``; used to indent tree lines and stored
            alongside every collected file path.
        file_paths: Optional list that the collected ``(indent, path)`` pairs
            are appended to. A fresh list is used when omitted.
        is_base: When true, a progress bar is shown for the entries of ``path``.

    Returns:
        ``(tree_str, file_paths)``. Entries appear in the order the API listed
        them; anything under a ``.github`` directory is left out.
    """

    async def process_item(item: Dict[str, Any]) -> Tuple[str, List[Tuple[int, str]]]:
        # Skip the .github folder entirely (tree line and file contents).
        if '.github' in item['path'].split('/'):
            return "", []
        if item['type'] == 'dir':
            header = ' ' * indent + f"[{item['name']}/]\n"
            subtree, sub_paths = await build_directory_tree(
                owner, repo, item['path'], token, indent + 1
            )
            return header + subtree, sub_paths
        line = ' ' * indent + f"{item['name']}\n"
        if item['name'].endswith(_INCLUDED_EXTENSIONS):
            return line, [(indent, item['path'])]
        return line, []

    collected: List[Tuple[int, str]] = [] if file_paths is None else file_paths

    items = await fetch_repo_content(owner, repo, path, token)
    if not items:
        return "", collected
    if isinstance(items, dict):
        # The API returns a single object, not a list, when `path` is a file.
        items = [items]

    # gather() keeps the listing order, so the tree is deterministic, and each
    # task returns only its own paths, so nothing is counted twice.
    if is_base:
        with tqdm(total=len(items), desc="Building tree") as progress:

            async def tracked(item: Dict[str, Any]) -> Tuple[str, List[Tuple[int, str]]]:
                result = await process_item(item)
                progress.update(1)
                return result

            results = await asyncio.gather(*(tracked(item) for item in items))
    else:
        results = await asyncio.gather(*(process_item(item) for item in items))

    tree_str = ''.join(fragment for fragment, _ in results)
    for _, paths in results:
        collected.extend(paths)
    return tree_str, collected


async def fetch_file_content(args, semaphore) -> str:
    """
    Fetch one file and format it as a prompt section.

    Args:
        args: ``(owner, repo, path, token, indent)``.
        semaphore: Limits how many fetches run at the same time.
    """
    owner, repo, path, token, indent = args
    async with semaphore:
        file_info = await fetch_repo_content(owner, repo, path, token)
    file_content = get_file_content(file_info)
    pad = ' ' * indent
    return '\n' + pad + f"{path}:\n" + pad + '\n' + file_content + '\n' + pad + '\n'


async def fetch_file_contents(owner, repo, file_paths, github_token, concurrency: int = 100) -> str:
    """
    Fetch every file in ``file_paths`` and join the formatted sections.

    Args:
        owner: Repository owner.
        repo: Repository name.
        file_paths: ``(indent, path)`` pairs, as returned by ``build_directory_tree``.
        github_token: Optional access token.
        concurrency: Maximum number of simultaneous fetches (at least 1).

    Raises:
        ValueError: If ``concurrency`` is smaller than 1.
    """
    if concurrency < 1:
        # Semaphore(0) would never let a fetch start.
        raise ValueError("concurrency must be at least 1")
    semaphore = asyncio.Semaphore(concurrency)
    tasks = [
        fetch_file_content((owner, repo, path, github_token, indent), semaphore)
        for indent, path in file_paths
    ]
    # asyncio.gather keeps the results in the order of file_paths.
    formatted_contents = await asyncio.gather(*tasks)
    return ''.join(formatted_contents)


async def extract_repo(
    github_url: str,
    github_token: Optional[str] = None,
    max_concurrent_requests: int = 100
) -> Tuple[str, str]:
    '''
    Args:
        github_url : str, A URL to a Github repository of the form .../<owner>/<repo>/tree/<branch>
        github_token : Optional[str], A Github personal access token, if not provided will use the GITHUB_TOKEN env variable
        max_concurrent_requests : int, The number of files that are fetched concurrently
    Returns:
        tuple[str, str] : (formatted_string, directory_tree) - the README followed by the
            contents of every included file, and the text rendering of the directory tree
    Raises:
        ValueError : if the URL does not have the expected form
    '''
    if github_token is None:
        github_token = os.getenv("GITHUB_TOKEN")

    segments = urlparse(github_url).path.strip('/').split('/')
    if len(segments) < 4 or segments[2] != 'tree':
        raise ValueError(
            "Please provide a URL of the form "
            "'https://github.com/<owner>/<repo>/tree/<branch_name>'. "
            f"Got URL: {github_url}"
        )
    # NOTE(review): the branch named in the URL is only validated; no `ref` is
    # sent with the API requests below.
    owner, repo = parse_github_url(github_url)

    readme_info = await fetch_repo_content(owner, repo, 'README.md', github_token)
    readme_content = get_file_content(readme_info)
    formatted_string = f"README.md:\n\n{readme_content}\n\n\n"

    t0 = time.time()
    directory_tree, file_paths = await build_directory_tree(owner, repo, token=github_token, is_base=True)
    print(f"Time in build_directory_tree: {time.time() - t0:.2f} seconds")
    t0 = time.time()
    formatted_string += await fetch_file_contents(
        owner, repo, file_paths, github_token, max_concurrent_requests
    )
    print(f"Time in fetch_file_contents: {time.time() - t0:.2f} seconds")
    return formatted_string, directory_tree

# ----- src/repo2prompt/types.py -----
from datetime import datetime
import time
from typing import Optional


class RateLimitExceeded(Exception):
    """
    Exception raised when the rate limit is exceeded.

    Args:
        reset_timestamp: Epoch seconds at which the limit resets. When omitted,
            one hour from now is assumed.
    """
    def __init__(self, reset_timestamp: Optional[float] = None):
        if reset_timestamp is None:
            reset_timestamp = int(time.time()) + 60 * 60
        # Kept on the instance so callers can schedule a retry.
        self.reset_timestamp = reset_timestamp
        super().__init__(
            f"Rate limit exceeded. Please wait until {datetime.fromtimestamp(reset_timestamp)} to try again."
        )