Merge pull request #145 from puja-trivedi/update_genomeannotation_20250221

puja-trivedi · web-flow · commit 9ec80caa4399 · 2025-06-20T15:45:55.000-07:00
updated genome_annotation.yaml to match formatting standards
diff --git a/erdiagram-autogen/bke_taxonomy.md b/erdiagram-autogen/bke_taxonomy.md
@@ -35,6 +35,7 @@ ColorPalette {
 }
 MatrixFile {
     stringList content_url  
+    uriorcurieList xref  
     string id  
     iri_type iri  
     curieList category  
@@ -43,7 +44,6 @@ MatrixFile {
     narrative_text description  
     boolean deprecated  
     stringList provided_by  
-    uriorcurieList xref  
     label_type full_name  
     label_typeList synonym  
 }
@@ -91,6 +91,7 @@ CellSpecimen {
 }
 ObservationRow {
     string label  
+    uriorcurieList xref  
     string id  
     iri_type iri  
     curieList category  
@@ -99,12 +100,12 @@ ObservationRow {
     narrative_text description  
     boolean deprecated  
     stringList provided_by  
-    uriorcurieList xref  
     label_type full_name  
     label_typeList synonym  
 }
 ObservationMatrix {
     stringList content_url  
+    uriorcurieList xref  
     string id  
     iri_type iri  
     curieList category  
@@ -113,7 +114,6 @@ ObservationMatrix {
     narrative_text description  
     boolean deprecated  
     stringList provided_by  
-    uriorcurieList xref  
     label_type full_name  
     label_typeList synonym  
 }
diff --git a/json-schema-autogen/bke_taxonomy.json b/json-schema-autogen/bke_taxonomy.json
@@ -2409,7 +2409,7 @@
         },
         "GeneAnnotation": {
             "additionalProperties": false,
-            "description": "An annotation describing the location, boundaries, and functions of  individual genes within a genome annotation.",
+            "description": "Represents a single gene. Includes metadata about the gene, such as its molecular type and the genome annotation it was referenced from.",
             "properties": {
                 "category": {
                     "description": "Name of the high level ontology class in which this entity is categorized. Corresponds to the label for the biolink entity type class. In a neo4j database this MAY correspond to the neo4j label tag. In an RDF database it should be a biolink model class URI. This field is multi-valued. It should include values for ancestors of the biolink class; for example, a protein such as Shh would have category values `biolink:Protein`, `biolink:GeneProduct`, `biolink:MolecularEntity`. In an RDF database, nodes will typically have an rdf:type triples. This can be to the most specific biolink class, or potentially to a class more specific than something in biolink. For example, a sequence feature `f` may have a rdf:type assertion to a SO class such as TF_binding_site, which is more specific than anything in biolink. Here we would have categories {biolink:GenomicEntity, biolink:MolecularEntity, biolink:NamedThing}. NOTE: The category slot was modified to have a curie range and a pattern for bican categories.",
@@ -2531,7 +2531,7 @@
                             "type": "string"
                         }
                     ],
-                    "description": "The genome annotation that this gene annotation was referenced from.",
+                    "description": "The genome annotation that this gene was referenced from.",
                     "type": "string"
                 },
                 "source_id": {
@@ -2724,7 +2724,7 @@
         },
         "GenomeAnnotation": {
             "additionalProperties": false,
-            "description": "Location and nomenclature of genes and all of the coding regions in a genome assembly  and the classification of genes and transcripts into types.",
+            "description": "Represents a genome annotation. Includes metadata about the genome, such as its version and reference assembly.",
             "properties": {
                 "authority": {
                     "$ref": "#/$defs/AuthorityType",
@@ -2911,7 +2911,7 @@
         },
         "GenomeAssembly": {
             "additionalProperties": false,
-            "description": "Genome assembly to contain version and label information",
+            "description": "Represents a genome assembly. A genome assembly is a computational representation of a genome sequence.",
             "properties": {
                 "category": {
                     "description": "Name of the high level ontology class in which this entity is categorized. Corresponds to the label for the biolink entity type class. In a neo4j database this MAY correspond to the neo4j label tag. In an RDF database it should be a biolink model class URI. This field is multi-valued. It should include values for ancestors of the biolink class; for example, a protein such as Shh would have category values `biolink:Protein`, `biolink:GeneProduct`, `biolink:MolecularEntity`. In an RDF database, nodes will typically have an rdf:type triples. This can be to the most specific biolink class, or potentially to a class more specific than something in biolink. For example, a sequence feature `f` may have a rdf:type assertion to a SO class such as TF_binding_site, which is more specific than anything in biolink. Here we would have categories {biolink:GenomicEntity, biolink:MolecularEntity, biolink:NamedThing}. NOTE: The category slot was modified to have a curie range and a pattern for bican categories.",
diff --git a/json-schema-autogen/genome_annotation.json b/json-schema-autogen/genome_annotation.json
@@ -683,7 +683,7 @@
         },
         "GeneAnnotation": {
             "additionalProperties": false,
-            "description": "An annotation describing the location, boundaries, and functions of  individual genes within a genome annotation.",
+            "description": "Represents a single gene. Includes metadata about the gene, such as its molecular type and the genome annotation it was referenced from.",
             "properties": {
                 "category": {
                     "description": "Name of the high level ontology class in which this entity is categorized. Corresponds to the label for the biolink entity type class. In a neo4j database this MAY correspond to the neo4j label tag. In an RDF database it should be a biolink model class URI. This field is multi-valued. It should include values for ancestors of the biolink class; for example, a protein such as Shh would have category values `biolink:Protein`, `biolink:GeneProduct`, `biolink:MolecularEntity`. In an RDF database, nodes will typically have an rdf:type triples. This can be to the most specific biolink class, or potentially to a class more specific than something in biolink. For example, a sequence feature `f` may have a rdf:type assertion to a SO class such as TF_binding_site, which is more specific than anything in biolink. Here we would have categories {biolink:GenomicEntity, biolink:MolecularEntity, biolink:NamedThing}. NOTE: The category slot was modified to have a curie range and a pattern for bican categories.",
@@ -805,7 +805,7 @@
                             "type": "string"
                         }
                     ],
-                    "description": "The genome annotation that this gene annotation was referenced from.",
+                    "description": "The genome annotation that this gene was referenced from.",
                     "type": "string"
                 },
                 "source_id": {
@@ -998,7 +998,7 @@
         },
         "GenomeAnnotation": {
             "additionalProperties": false,
-            "description": "Location and nomenclature of genes and all of the coding regions in a genome assembly  and the classification of genes and transcripts into types.",
+            "description": "Represents a genome annotation. Includes metadata about the genome, such as its version and reference assembly.",
             "properties": {
                 "authority": {
                     "$ref": "#/$defs/AuthorityType",
@@ -1185,7 +1185,7 @@
         },
         "GenomeAssembly": {
             "additionalProperties": false,
-            "description": "Genome assembly to contain version and label information",
+            "description": "Represents a genome assembly. A genome assembly is a computational representation of a genome sequence.",
             "properties": {
                 "category": {
                     "description": "Name of the high level ontology class in which this entity is categorized. Corresponds to the label for the biolink entity type class. In a neo4j database this MAY correspond to the neo4j label tag. In an RDF database it should be a biolink model class URI. This field is multi-valued. It should include values for ancestors of the biolink class; for example, a protein such as Shh would have category values `biolink:Protein`, `biolink:GeneProduct`, `biolink:MolecularEntity`. In an RDF database, nodes will typically have an rdf:type triples. This can be to the most specific biolink class, or potentially to a class more specific than something in biolink. For example, a sequence feature `f` may have a rdf:type assertion to a SO class such as TF_binding_site, which is more specific than anything in biolink. Here we would have categories {biolink:GenomicEntity, biolink:MolecularEntity, biolink:NamedThing}. NOTE: The category slot was modified to have a curie range and a pattern for bican categories.",
diff --git a/jsonld-context-autogen/bke_taxonomy.context.jsonld b/jsonld-context-autogen/bke_taxonomy.context.jsonld
@@ -512,12 +512,12 @@
       "Gene": {
          "@id": "biolink:Gene"
       },
-      "GeneAnnotation": {
-         "@id": "GeneAnnotation"
-      },
       "GeneOrGeneProduct": {
          "@id": "biolink:GeneOrGeneProduct"
       },
+      "GeneAnnotation": {
+         "@id": "GeneAnnotation"
+      },
       "Genome": {
          "@id": "biolink:Genome"
       },
diff --git a/jsonld-context-autogen/genome_annotation.context.jsonld b/jsonld-context-autogen/genome_annotation.context.jsonld
@@ -214,9 +214,7 @@
          },
          "@id": "authority"
       },
-      "category": {
-         "@id": "biolink:category"
-      },
+      "category": "@type",
       "checksum_algorithm": {
          "@context": {
             "text": "skos:notation",
@@ -384,12 +382,12 @@
       "Gene": {
          "@id": "biolink:Gene"
       },
-      "GeneAnnotation": {
-         "@id": "GeneAnnotation"
-      },
       "GeneOrGeneProduct": {
          "@id": "biolink:GeneOrGeneProduct"
       },
+      "GeneAnnotation": {
+         "@id": "GeneAnnotation"
+      },
       "Genome": {
          "@id": "biolink:Genome"
       },
@@ -448,5 +446,4 @@
          "@id": "biolink:ThingWithTaxon"
       }
    }
-}
-
+}
diff --git a/linkml-schema/bke_taxonomy.yaml b/linkml-schema/bke_taxonomy.yaml
@@ -440,6 +440,7 @@ classes:
     - has_variable
     - was_derived_from
     - content_url
+    - xref
     slot_usage:
       was_generated_by:
         description: The aggregation process from which a observation matrix was generated
@@ -453,7 +454,7 @@ classes:
       has_variable:
         description: One of set of genes which together forms the variable set of
           an observation matrix.
-        range: gene annotation
+        range: GeneAnnotation
         multivalued: true
       was_derived_from:
         description: One of many cell specimens from which observations in the matrix
@@ -464,6 +465,10 @@ classes:
         local_names:
           local_name_value: url
           local_name_source: allen
+      xref:
+        local_names:
+          local_name_value: unique_id
+          local_name_source: allen
   ObservationRow:
     description: One specific row of the observation matrix representing a set of
       measurements preformed on a sample over a set of variables or features.
@@ -476,6 +481,7 @@ classes:
     - represented_in
     - was_derived_from
     - label
+    - xref
     slot_usage:
       part_of_matrix:
         description: The observation matrix for which is observation row is part of.
@@ -487,6 +493,10 @@ classes:
         description: The cell specimen from which the observation was derived from.
         range: CellSpecimen
       label: {}
+      xref:
+        local_names:
+          local_name_value: unique_id
+          local_name_source: allen
   CellSpecimen:
     description: ( defined as in BERS)
     from_schema: https:/w3id.org/brain-bican/bke-taxonomy
@@ -521,7 +531,7 @@ classes:
           local_name_source: allen
         description: One of potentially many gene annotation terms to which the abbreviation
           denotes.
-        range: gene annotation
+        range: GeneAnnotation
         multivalued: true
       denotes_parcellation_term:
         local_names:
@@ -562,11 +572,16 @@ classes:
     - ProvEntity
     slots:
     - content_url
+    - xref
     slot_usage:
       content_url:
         local_names:
           local_name_value: url
           local_name_source: allen
+      xref:
+        local_names:
+          local_name_value: unique_id
+          local_name_source: allen
   ColorPalette:
     description: A schematic set of display colors that can be applied to individual
       components of the associated entity set (for example, a taxonomy).
diff --git a/linkml-schema/genome_annotation.yaml b/linkml-schema/genome_annotation.yaml
@@ -2,7 +2,7 @@ id: https://identifiers.org/brain-bican/genome-annotation-schema
 name: genome-annotation-schema
 title: Genome Annotation Schema
 description: |-
-  The Genome Annotation schema is designed to represent types and relationships of an organism's annotated genome.
+  The Genome Annotation Schema is designed to represent all the genes from a given genome annotation.
   
 prefixes:
   linkml: https://w3id.org/linkml/
@@ -19,75 +19,68 @@ default_range: string
 default_prefix: bican
 
 classes:
-  gene annotation:
+  GeneAnnotation:
     is_a: gene
     description: >-
-      An annotation describing the location, boundaries, and functions of 
-      individual genes within a genome annotation.
+      Represents a single gene. Includes metadata about the gene, such as its molecular type and the genome annotation it was referenced from.
     slots:
-      - molecular type
-      - source id
+      - molecular_type
+      - source_id
     attributes:
-      referenced in: 
-        description: The genome annotation that this gene annotation was referenced from.
+      referenced_in: 
+        description: The genome annotation that this gene was referenced from.
         required: true
         inlined: true
         any_of:
-          - range: genome annotation
+          - range: GenomeAnnotation
           - range: string
-    id_prefixes:
-      - ENSEMBL
-      - MGI
-      - NCBIGene
 
-  genome annotation:
+  GenomeAnnotation:
     is_a: genome
     description: >-
-      Location and nomenclature of genes and all of the coding regions in a genome assembly 
-      and the classification of genes and transcripts into types.
+      Represents a genome annotation. Includes metadata about the genome, such as its version and reference assembly.
     slots:
       - version
       - digest
       - content_url
       - authority
     attributes:
-      reference assembly:
+      reference_assembly:
         description: The reference genome assembly that this genome annotation was created from.
         required: true
         inlined: true
         any_of:
-          - range: genome assembly
+          - range: GenomeAssembly
           - range: string
 
-  genome assembly:
+  GenomeAssembly:
     is_a: named thing
     mixins:
       - thing with taxon
     description: >-
-      Genome assembly to contain version and label information
+      Represents a genome assembly. A genome assembly is a computational representation of a genome sequence.
     slots:
       - version
       - strain
 
-  annotation collection:
+  AnnotationCollection:
     tree_root: true
     attributes:
       annotations:
         multivalued: true
         inlined_as_list: true
-        range: gene annotation
+        range: GeneAnnotation
       genome_annotations:
         multivalued: true
         inlined_as_list: true
-        range: genome annotation
+        range: GenomeAnnotation
       genome_assemblies:
         multivalued: true
         inlined_as_list: true
-        range: genome assembly
-
+        range: GenomeAssembly
 
 slots:
-  molecular type:
+  molecular_type:
     any_of:
       - range: BioType
       - range: string
@@ -100,14 +93,13 @@ slots:
     description: The organization responsible for publishing the data. 
     range: AuthorityType
 
-  source id:
+  source_id:
     description: The authority specific identifier. 
     slot_uri: schema:identifier
   
   strain:
     description: The genetic variant or subtype of a species or organism. 
 
-
 enums:
   BioType:
     permissible_values:
diff --git a/linkml-schema/source_bke_taxonomy/gsheet_output/Relations.tsv b/linkml-schema/source_bke_taxonomy/gsheet_output/Relations.tsv
@@ -1,7 +1,7 @@
-Object	Predicate	Subject	Multivalued	ExactlyOneOf	Slot Definition	Quasi-formalism	BKE Local Name	Local Name Source
+Class	Predicate	Subject	Multivalued	ExactlyOneOf	Slot Definition	Quasi-formalism	BKE Local Name	Local Name Source
 > class	slot	range	multivalued	ignore	description	ignore	local_names	local_names
 >		internal_separator: "|"					inner_key: local_name_value	inner_key: local_name_source
-Abbreviation	denotes_gene_annotation	gene annotation	TRUE	FALSE	One of potentially many gene annotation terms to which the abbreviation denotes.	Some abbreviation (a) denotes some GeneAnnotation (G) if and only if a refers to G.	denotes	allen
+Abbreviation	denotes_gene_annotation	GeneAnnotation	TRUE	FALSE	One of potentially many gene annotation terms to which the abbreviation denotes.	Some abbreviation (a) denotes some GeneAnnotation (G) if and only if a refers to G.	denotes	allen
 Abbreviation	denotes_parcellation_term	ParcellationTerm	TRUE	FALSE	One of potentially many parcellation terms (anatomical structures) which the abbreviation denotes.	A denotes some PT	denotes	allen
 CellTypeSet	part_of_taxonomy	CellTypeTaxonomy	FALSE	TRUE	The cell types taxonomy for which the cell type set is part of.	CTS part_of_taxonomy CTT if and only if CTT is a taxonomy, and CTS is a member of CTT		
 CellTypeSet	contains_taxon	CellTypeTaxon	TRUE	FALSE	One of potentially many cell types taxons which together defines the cell type set.	CTS contains_taxon CTT if and only if CTT is a part of CTS		
@@ -28,7 +28,7 @@ DisplayColor	is_color_for_taxon	CellTypeTaxon	FALSE	TRUE	The associated cell typ
 DisplayColor	is_color_for_set	CellTypeSet	FALSE	TRUE	The associated cell type set of the specified display color.	DC is_color_for CTS if DC is associated with CTS	is_color_for	allen
 ObservationMatrix	was_generated_by	ObservationMatrixCreationProcess	FALSE	TRUE	The aggregation process from which a observation matrix was generated by.	OM was_generated_by some OMCP if and only if OMCP is a process and OM was_output_of OMCP		
 ObservationMatrix	represented_by	MatrixFile	TRUE	FALSE	One of potentially matrix files which together represents the entire observation matrix.	MF concretizes some OM		
-ObservationMatrix	has_variable	gene annotation	TRUE	FALSE	One of set of genes which together forms the variable set of an observation matrix.			
+ObservationMatrix	has_variable	GeneAnnotation	TRUE	FALSE	One of set of genes which together forms the variable set of an observation matrix.			
 ObservationMatrix	was_derived_from	CellSpecimen	TRUE	FALSE	One of many cell specimens from which observations in the matrix was derived from.	OM was_derived_from some CSp if and only if OM has_input some CSp		
 ObservationRow	part_of_matrix	ObservationMatrix	FALSE	TRUE	The observation matrix for which is observation row is part of.	OR part_of_matrix some OM if and only if OR part_of some OM		
 ObservationRow	represented_in	MatrixFile	FALSE	TRUE	The specific file where this observation row is represented.	OR represented in MF if and only if OR is part_of some OM and OM is_concretized_in some MF		
diff --git a/linkml-schema/source_bke_taxonomy/gsheet_output/Slots.tsv b/linkml-schema/source_bke_taxonomy/gsheet_output/Slots.tsv
@@ -45,5 +45,8 @@ DisplayColor	id		( database GUID)	FALSE		string
 DisplayColor	color_hex_triplet	#450099	A hex string representing the display color for an associated entity.	FALSE		string			
 DisplayColor	xref							unique_id	allen
 MatrixFile	content_url							url	allen
+MatrixFile	xref							unique_id	allen
 ObservationMatrix	content_url							url	allen
-ObservationRow	label								
+ObservationMatrix	xref							unique_id	allen
+ObservationRow	label								
+ObservationRow	xref							unique_id	allen
diff --git a/models_py-autogen/bke_taxonomy.py b/models_py-autogen/bke_taxonomy.py
diff --git a/models_py-autogen/genome_annotation.py b/models_py-autogen/genome_annotation.py