Open
Description
Describe the bug
When I run the example from the GitHub source connector documentation (https://unstructured-io.github.io/unstructured/ingest/source_connectors/github.html), I get the following error:
`Ran 1 test in 3.557s
FAILED (errors=1)
https://github.com/Unstructured-IO/unstructured
2023-12-05 10:56:44,983 MainProcess DEBUG updating download directory to: C:\Users\kpanca1\.cache\unstructured\ingest\github\30491e0f70
2023-12-05 10:56:45,639 MainProcess INFO running pipeline: DocFactory -> Reader -> Partitioner -> Copier with config: {"reprocess": false, "verbose": true, "work_dir": "C:\\Users\\kpanca1\\.cache\\unstructured\\ingest\\pipeline", "output_dir": "github-ingest-output", "num_processes": 2, "raise_on_error": false}
2023-12-05 10:56:47,936 MainProcess INFO Running doc factory to generate ingest docs. Source connector: {"processor_config": {"reprocess": false, "verbose": true, "work_dir": "C:\\Users\\kpanca1\\.cache\\unstructured\\ingest\\pipeline", "output_dir": "github-ingest-output", "num_processes": 2, "raise_on_error": false}, "read_config": {"download_dir": "C:\\Users\\kpanca1\\.cache\\unstructured\\ingest\\github\\30491e0f70", "re_download": false, "preserve_downloads": false, "download_only": false, "max_docs": null}, "connector_config": {"url": "Unstructured/unstructured", "access_token": null, "branch": "main", "file_glob": null, "repo_path": "Unstructured/unstructured"}}
Error
Traceback (most recent call last):
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\unstructured\ingest\error.py", line 19, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\unstructured\utils.py", line 214, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\unstructured\ingest\connector\github.py", line 49, in get_repo
return github.get_repo(self.repo_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\github\MainClass.py", line 380, in get_repo
headers, data = self.__requester.requestJsonAndCheck("GET", url)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\github\Requester.py", line 494, in requestJsonAndCheck
return self.__check(*self.requestJson(verb, url, parameters, headers, input, self.__customConnection(url)))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\github\Requester.py", line 525, in __check
raise self.createException(status, responseHeaders, data)
github.GithubException.UnknownObjectException: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\_git\unstructured-cracker\tests\test_loaders.py", line 10, in test_load
loader.load()
File "C:\_git\unstructured-cracker\unstructured_cracker\loader\document_loaders\github.py", line 22, in load
runner.run(
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\unstructured\ingest\runner\github.py", line 50, in run
self.process_documents(source_doc_connector=source_doc_connector)
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\unstructured\ingest\runner\base_runner.py", line 57, in process_documents
process_documents(
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\unstructured\ingest\processor.py", line 90, in process_documents
pipeline.run()
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\unstructured\ingest\pipeline\pipeline.py", line 51, in run
dict_docs = self.doc_factory_node()
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\unstructured\ingest\pipeline\interfaces.py", line 67, in __call__
self.result = self.run()
^^^^^^^^^^
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\unstructured\ingest\pipeline\doc_factory.py", line 10, in run
docs = self.source_doc_connector.get_ingest_docs()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\unstructured\ingest\connector\github.py", line 158, in get_ingest_docs
repo = self.connector_config.get_repo()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\_git\unstructured-cracker\venv\Lib\site-packages\unstructured\ingest\error.py", line 22, in wrapper
raise cls(cls.error_string.format(str(error))) from error
unstructured.ingest.error.SourceConnectionError: Error in getting data from upstream data source: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
Process finished with exit code 1
`
To Reproduce
Run the following code:
```python
runner = GithubRunner(
    processor_config=ProcessorConfig(
        verbose=True,
        output_dir="github-ingest-output",
        num_processes=2,
    ),
    read_config=ReadConfig(),
    partition_config=PartitionConfig(),
)
runner.run(
    url="https://github.com/Unstructured-IO/unstructured",
    git_branch="main"
)
```
Expected behavior
The run should complete without errors and write its output to a file.