diff --git a/src/api.py b/src/api.py
index 6b4f6a6..33e577a 100644
--- a/src/api.py
+++ b/src/api.py
@@ -68,7 +68,7 @@ async def gimie(full_path:str,
     return {"link": full_path,
             "output": gimie_output}
 
-@app.get("/v1/llm/{full_path:path}")
+@app.get("/v1/llm/json-ld/{full_path:path}")
 async def llm(full_path:str):
 
     try:
@@ -82,6 +82,21 @@ async def llm(full_path:str):
     return {"link": full_path,
             "output": llm_result}
 
+@app.get("/v1/llm/json/{full_path:path}")
+async def llm_json(full_path:str):
+    """Return the llm_request_repo_infos result in "json" format; any failure becomes HTTP 424."""
+
+    try:
+        llm_result = llm_request_repo_infos(str(full_path), output_format="json")
+    except Exception as e:
+        raise HTTPException(
+            status_code=424,
+            detail=f"Error from LLM service: {e}"
+        )
+
+    return {"link": full_path,
+            "output": llm_result}
+
 @app.exception_handler(ValueError)
 async def value_error_exception_handler(request: Request, exc: ValueError):
     return JSONResponse(
diff --git a/src/core/genai_model.py b/src/core/genai_model.py
index e4fcccd..ede5deb 100644
--- a/src/core/genai_model.py
+++ b/src/core/genai_model.py
@@ -110,7 +110,7 @@ def clone_repo(repo_url):
         return None
 
 
-def llm_request_repo_infos(repo_url):
+def llm_request_repo_infos(repo_url, output_format="json-ld"):
     # Clone the GitHub repository into a temporary folder
     with tempfile.TemporaryDirectory() as temp_dir:
         logger.info(f"Cloning {repo_url} into {temp_dir}...")
@@ -146,7 +146,6 @@ def llm_request_repo_infos(repo_url):
             raw_result = response.json()["choices"][0]["message"]["content"]
             parsed_result = clean_json_string(raw_result)
             json_data = json.loads(parsed_result)
-            pprint(json_data)
 
             logger.info("Successfully parsed API response")
 
@@ -161,7 +160,14 @@ def llm_request_repo_infos(repo_url):
             # TODO. This is hardcoded. Not good.
             context_path = "src/files/json-ld-context.json"
             # Now convert cleaned data to JSON-LD
-            return json_to_jsonLD(cleaned_json, context_path)
+            # Only "json-ld" is converted; "json" returns the cleaned data as-is.
+            if output_format == "json-ld":
+                return json_to_jsonLD(cleaned_json, context_path)
+            elif output_format == "json":
+                return cleaned_json
+            else:
+                logger.error(f"Unsupported output format: {output_format}")
+                return None
 
         except Exception as e:
             logger.error(f"Error parsing response: {e}")
diff --git a/src/core/models.py b/src/core/models.py
index 7c4aec6..4f58890 100644
--- a/src/core/models.py
+++ b/src/core/models.py
@@ -57,6 +57,64 @@ class Image(BaseModel):
     contentUrl: HttpUrl = None
     keywords: ImageKeyword = ImageKeyword.ILLUSTRATIVE_IMAGE
 
+
+class Discipline(str, Enum):
+    """Allowed values for ``SoftwareSourceCode.discipline``."""
+    SOCIAL_SCIENCES = "Social sciences"
+    ANTHROPOLOGY = "Anthropology"
+    COMMUNICATION_STUDIES = "Communication studies"
+    EDUCATION = "Education"
+    LINGUISTICS = "Linguistics"
+    RESEARCH = "Research"
+    SOCIOLOGY = "Sociology"
+    GEOGRAPHY = "Geography"
+    PSYCHOLOGY = "Psychology"
+    POLITICS = "Politics"
+    ECONOMICS = "Economics"
+    APPLIED_SCIENCES = "Applied sciences"
+    HEALTH_SCIENCES = "Health sciences"
+    ELECTRICAL_ENGINEERING = "Electrical engineering"
+    CHEMICAL_ENGINEERING = "Chemical engineering"
+    CIVIL_ENGINEERING = "Civil engineering"
+    ARCHITECTURE = "Architecture"
+    COMPUTER_ENGINEERING = "Computer engineering"
+    ENERGY_ENGINEERING = "Energy engineering"
+    MILITARY_SCIENCE = "Military science"
+    INDUSTRIAL_PRODUCTION_ENGINEERING = "Industrial and production engineering"
+    MECHANICAL_ENGINEERING = "Mechanical engineering"
+    BIOLOGICAL_ENGINEERING = "Biological engineering"
+    ENVIRONMENTAL_SCIENCE = "Environmental science"
+    SYSTEMS_SCIENCE_ENGINEERING = "Systems science and engineering"
+    INFORMATION_ENGINEERING = "Information engineering"
+    AGRICULTURAL_FOOD_SCIENCES = "Agricultural and food sciences"
+    BUSINESS = "Business"
+    HUMANITIES = "Humanities"
+    HISTORY = "History"
+    LITERATURE = "Literature"
+    ART = "Art"
+    RELIGION = "Religion"
+    PHILOSOPHY = "Philosophy"
+    LAW = "Law"
+    FORMAL_SCIENCES = "Formal sciences"
+    MATHEMATICS = "Mathematics"
+    LOGIC = "Logic"
+    STATISTICS = "Statistics"
+    THEORETICAL_COMPUTER_SCIENCE = "Theoretical computer science"
+    NATURAL_SCIENCES = "Natural sciences"
+    PHYSICS = "Physics"
+    ASTRONOMY = "Astronomy"
+    BIOLOGY = "Biology"
+    CHEMISTRY = "Chemistry"
+    EARTH_SCIENCE = "Earth science"
+
+class RepositoryType(str, Enum):
+    """Allowed values for ``SoftwareSourceCode.repositoryType``."""
+    SOFTWARE = "software"
+    EDUCATIONAL_RESOURCE = "educational resource"
+    DOCUMENTATION = "documentation"
+    DATA = "data"
+    OTHER = "other"
+
 class SoftwareSourceCode(BaseModel):
     name: Optional[str] = None
     applicationCategory: Optional[List[str]] = None
@@ -74,6 +132,7 @@ class SoftwareSourceCode(BaseModel):
     license: Annotated[str, StringConstraints(pattern=r"spdx\.org.*")] = None
     author: List[Union[Person, Organization]] = None
     relatedToOrganization: Optional[List[str]] = None
+    relatedToOrganizationJustification: Optional[List[str]] = None
     operatingSystem: Optional[List[str]] = None
     programmingLanguage: Optional[List[str]] = None
     softwareRequirements: Optional[List[str]] = None
@@ -94,6 +153,10 @@ class SoftwareSourceCode(BaseModel):
     imagingModality: Optional[List[str]] = None
     fairLevel: Optional[str] = None
     graph: Optional[str] = None
+    discipline: Optional[List[Discipline]] = None
+    disciplineJustification: Optional[List[str]] = None
+    repositoryType: Optional[RepositoryType] = None
+    repositoryTypeJustification: Optional[List[str]] = None
 
 
 ############################################################
diff --git a/src/core/prompts.py b/src/core/prompts.py
index 2a047b1..d85ccae 100644
--- a/src/core/prompts.py
+++ b/src/core/prompts.py
@@ -36,6 +36,7 @@
   - `orcidId` (valid URL, **optional**)
   - `affiliation` (list of strings, **optional**): Institutions the author is affiliated with. Do not mention Imaging Plaza unless is explicity mentioned.
 - `relatedToOrganization` (list of strings, **optional**): Institutions associated with the software. Do not mention Imaging Plaza unless is explicity mentioned.
+- `relatedToOrganizationJustification` (list of strings, **optional**): Justification for the related organizations.
 - `softwareRequirements` (list of strings, **optional**): Dependencies or prerequisites for running the software.
 - `operatingSystem` (list of strings, **optional**): Compatible operating systems. Use only Windows, Linux, MacOS, or Other.
 - `programmingLanguage` (list of strings, **optional**): Programming languages used in the software.
@@ -92,18 +93,10 @@
 - `hasExecutableInstructions` (string, **optional**): Any exectuable instructions related to the software. This should point to an URL where the installation is explained. If this is the README file, please make the full URL.
 - `readme` (valid URL, **optional**): README url of the software (at the root of the repo)
 - `imagingModality (list of strings, **optional**): imaging modalities accepted by the software.
-
-
-When dealing with Organization pay attention to
--
--
--
-
-When parsing Persons note:
--
--
--
-
+- `discipline` (list of strings, **optional**): Scientific disciplines the software belongs to. Base your response on the README and other documentation files content.
+- `disciplineJustification` (list of strings, **optional**): Justification for the discipline classification.
+- `repositoryType` (string, **optional**): Type of repository. Use only software, educational resource, documentation, data, or other.
+- `repositoryTypeJustification` (list of strings, **optional**): Justification for the repository type classification.
 
 PLEASE PROVIDE THE OUTPUT IN JSON FORMAT ONLY, WITHOUT ANY EXPLANATION OR ADDITIONAL TEXT. ALIGN THE RESPONSE TO THE SCHEMA SPECIFICATION.
 """
diff --git a/src/files/json-ld-context.json b/src/files/json-ld-context.json
index a48a4fd..f5e039d 100644
--- a/src/files/json-ld-context.json
+++ b/src/files/json-ld-context.json
@@ -76,7 +76,9 @@
     "hasRorId": "md4i:hasRorId",
     "legalName": "schema:legalName",
     "fundingGrant": "sd:fundingGrant",
-    "fundingSource": "sd:fundingSource"
+    "fundingSource": "sd:fundingSource",
+    "discipline": "pulse:discipline",
+    "repositoryType": "pulse:repositoryType"
   }
 }
 