Commit 09e7e44

Merge pull request #13 from sarda-devesh/main

Feedback puller scripts

2 parents 29d1551 + df88b0c

File tree

5 files changed: +245 -8 lines

.gitignore

Lines changed: 2 additions & 1 deletion

```diff
@@ -160,4 +160,5 @@ cython_debug/
 #.idea/

 macrostrat_db_insertion/actual_macrostrat.json
-macrostrat_db_insertion/temp_data
+macrostrat_db_insertion/temp_data
+retraining_runner/feedback_training_dataset
```

README.md

Lines changed: 15 additions & 0 deletions

````diff
@@ -76,3 +76,18 @@ with the database.
 ## Frontend React component

 The frontend React component can be found in this repo: [UW-Macrostrat/web-components](https://github.com/UW-Macrostrat/web-components/tree/main/packages/feedback-components). A current version of this feedback component can be found at [http://cosmos0003.chtc.wisc.edu:3000/?path=/docs/feedback-components-feedbackcomponent--docs](http://cosmos0003.chtc.wisc.edu:3000/?path=/docs/feedback-components-feedbackcomponent--docs)
+
+## Feedback puller
+
+Our training scripts use a different format to represent the relationships than the schema defined in `macrostrat_db_insertion/macrostrat_xdd_schema.sql`. We therefore wrote a script (`retraining_runner/feedback_puller.py`) that reads the feedback from the database and converts it to the format required by the training scripts. The following arguments must be passed to the script:
+```
+usage: feedback_puller.py [-h] --uri URI --schema SCHEMA --save_dir SAVE_DIR
+
+options:
+  -h, --help           show this help message and exit
+  --uri URI            The URI to use to connect to the database
+  --schema SCHEMA      The schema to connect to
+  --save_dir SAVE_DIR  The directory to save the results to
+```
+
+Check out this README for how to train the `unsupervised_kg` model on this feedback dataset: [https://github.com/UW-Macrostrat/unsupervised-kg?tab=readme-ov-file#spanbert-training](https://github.com/UW-Macrostrat/unsupervised-kg?tab=readme-ov-file#spanbert-training)
````
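For reference, a hypothetical invocation might look as follows; the connection URI and schema name are placeholders, and `feedback_training_dataset` simply mirrors the directory added to `.gitignore` in this commit:

```
python retraining_runner/feedback_puller.py \
    --uri postgresql://user:password@localhost:5432/macrostrat \
    --schema macrostrat_kg \
    --save_dir feedback_training_dataset
```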
Lines changed: 127 additions & 0 deletions

```json
{
    "nodes": [
        {
            "id": 392659,
            "type": 1,
            "name": "Kubler",
            "txt_range": [[204, 210]],
            "reasoning": null,
            "match": null
        },
        {
            "id": 392660,
            "type": 2,
            "name": "The top of the sedimentary infill",
            "txt_range": [[0, 33]],
            "reasoning": null,
            "match": null
        },
        {
            "id": 392662,
            "type": 3,
            "name": "sedimentary",
            "txt_range": [[15, 26]],
            "reasoning": null,
            "match": null
        },
        {
            "id": 392664,
            "type": 3,
            "name": "quartz",
            "txt_range": [[78, 84]],
            "reasoning": null,
            "match": { "type": "lith_att", "id": 94 }
        },
        {
            "id": 392668,
            "type": 3,
            "name": "fine",
            "txt_range": [[508, 512]],
            "reasoning": null,
            "match": { "type": "lith_att", "id": 45 }
        },
        {
            "id": -1,
            "type": 2,
            "name": "horizon",
            "txt_range": [[294, 301]],
            "reasoning": null,
            "match": null
        },
        {
            "id": -2,
            "type": 3,
            "name": "acidic",
            "txt_range": [[433, 439]],
            "reasoning": null,
            "match": null
        }
    ],
    "edges": [
        { "source": 392659, "dest": 392660 },
        { "source": 392660, "dest": 392662 },
        { "source": 392660, "dest": 392664 },
        { "source": 392660, "dest": 392668 },
        { "source": -1, "dest": -2 }
    ],
    "sourceTextId": 22950,
    "supersedesRunIds": [26730]
}
```
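This new file (its name is missing from the diff view above) is an example of the feedback format: a `nodes` list of entities with character-offset `txt_range` spans and optional `match` links (the `lith_att` ids look like Macrostrat lithology-attribute references, though the commit does not say so), plus an `edges` list wiring node ids together. A minimal sketch of walking such a graph; the file name `example_feedback.json` is hypothetical, and treating negative ids as user-added entities is an assumption:

```python
import json

# Load the example feedback graph (file name is hypothetical).
with open("example_feedback.json") as f:
    graph = json.load(f)

# Index nodes by id. Negative ids (-1, -2) sit alongside large positive
# database ids; we assume they mark entities newly added via feedback.
nodes_by_id = {node["id"]: node for node in graph["nodes"]}

# Print every edge as "source -> dest" using the node names.
for edge in graph["edges"]:
    src = nodes_by_id[edge["source"]]
    dst = nodes_by_id[edge["dest"]]
    print(f"{src['name']} -> {dst['name']}")
```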

macrostrat_db_insertion/server.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -240,7 +240,7 @@ def get_weaviate_text_id(source_text, request_additional_data, session: Session):
         sources_values["source_text_type"] = curr_text_type

         sources_insert_statement = INSERT_STATEMENT(sources_table).values(**sources_values)
-        sources_insert_statement = sources_insert_statement.on_conflict_do_nothing(index_elements = ["source_text_type", "paragraph_text"])
+        sources_insert_statement = sources_insert_statement.on_conflict_do_nothing(index_elements = ["source_text_type", "hashed_text"])
         session.execute(sources_insert_statement)
         session.commit()
     except:
@@ -250,7 +250,7 @@ def get_weaviate_text_id(source_text, request_additional_data, session: Session):
     try:
         source_id_select_statement = SELECT_STATEMENT(sources_table.c.id)
         source_id_select_statement = source_id_select_statement.where(sources_table.c.source_text_type == curr_text_type)
-        source_id_select_statement = source_id_select_statement.where(sources_table.c.paragraph_text == source_text["paragraph_text"])
+        source_id_select_statement = source_id_select_statement.where(sources_table.c.hashed_text == paragraph_hash)
         source_id_result = session.execute(source_id_select_statement).all()

         # Ensure we got a result
```
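Both hunks switch the conflict target and the lookup predicate from the raw `paragraph_text` to a stored hash (`hashed_text`, compared against a precomputed `paragraph_hash`), so duplicate detection matches on a short, indexable key instead of full paragraphs. The diff does not show how the hash is computed; the helper below is only a plausible sketch, with SHA-256 over the stripped text as an assumption rather than the repo's actual function:

```python
import hashlib

def hash_paragraph(paragraph_text: str) -> str:
    # Hypothetical helper: derive a stable, short key for a paragraph so
    # inserts and lookups can match on the hash rather than the full text.
    # The hash function actually used by the repo is not shown in this diff.
    return hashlib.sha256(paragraph_text.strip().encode("utf-8")).hexdigest()
```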

retraining_runner/feedback_puller.py

Lines changed: 99 additions & 5 deletions

```diff
@@ -3,11 +3,15 @@
 from sqlalchemy.orm import sessionmaker, declarative_base
 from sqlalchemy import select as SELECT_STATEMENT
 import argparse
+import os
+import pandas as pd
+import numpy as np

 def read_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--uri", type=str, required=True, help="The URI to use to connect to the database")
     parser.add_argument("--schema", type=str, required=True, help="The schema to connect to")
+    parser.add_argument("--save_dir", type=str, required=True, help="The directory to save the results to")
     return parser.parse_args()

 def load_sqlalchemy(args):
@@ -44,6 +48,34 @@ def get_all_user_runs(connection_details):

     return all_user_runs

+def get_entity_name(connection_details, entity_id):
+    # Load the entity table
+    entities_table_name = get_complete_table_name(connection_details, "entity")
+    entities_table = connection_details["metadata"].tables[entities_table_name]
+    entities_select_statement = SELECT_STATEMENT(entities_table)
+    entities_select_statement = entities_select_statement.where(entities_table.c.id == entity_id)
+
+    # Run the query
+    entities_select_result = connection_details["session"].execute(entities_select_statement).all()
+    if len(entities_select_result) == 0:
+        raise Exception("Can't find entity with id " + str(entity_id))
+
+    return entities_select_result[0]._mapping["name"].strip()
+
+def get_relationship_type(connection_details, relationship_type_id):
+    # Load the relationship type table
+    relationship_table_name = get_complete_table_name(connection_details, "relationship_type")
+    relationship_table = connection_details["metadata"].tables[relationship_table_name]
+    relationship_select_statement = SELECT_STATEMENT(relationship_table)
+    relationship_select_statement = relationship_select_statement.where(relationship_table.c.id == relationship_type_id)
+
+    # Run the query
+    relationship_select_result = connection_details["session"].execute(relationship_select_statement).all()
+    if len(relationship_select_result) == 0:
+        raise Exception("Can't find relationship type with id " + str(relationship_type_id))
+
+    return relationship_select_result[0]._mapping["name"].strip()
+
 def get_user_run_relationships(connection_details, save_dir, run_id, source_text_id):
     # Load the source text
     texts_table_name = get_complete_table_name(connection_details, "source_text")
@@ -55,19 +87,77 @@ def get_user_run_relationships(connection_details, save_dir, run_id, source_text_id):
     if len(text_select_result) == 0:
         raise Exception("Can't find text for source id " + str(source_text_id))

+    # Get the paragraph text details
     source_text = text_select_result[0]._mapping["paragraph_text"]
-    print(source_text_id, source_text)
+    source_text_hash = text_select_result[0]._mapping["hashed_text"]

     # Extract the relationship
     relationship_table_name = get_complete_table_name(connection_details, "relationship")
     relationship_table = connection_details["metadata"].tables[relationship_table_name]
     relationship_select_statement = SELECT_STATEMENT(relationship_table)
     relationship_select_statement = relationship_select_statement.where(relationship_table.c.run_id == run_id)

+    all_results = []
     all_relationships = connection_details["session"].execute(relationship_select_statement).all()
     for curr_relationship in all_relationships:
-        print(curr_relationship._mapping)
-        break
+        # Extract the fields
+        src_entity_id = curr_relationship._mapping["src_entity_id"]
+        dst_entity_id = curr_relationship._mapping["dst_entity_id"]
+        relationship_type_id = curr_relationship._mapping["relationship_type_id"]
+
+        # Get the values from the ids
+        src_text = get_entity_name(connection_details, src_entity_id)
+        dst_text = get_entity_name(connection_details, dst_entity_id)
+        relationship_type = get_relationship_type(connection_details, relationship_type_id)
+
+        # Record this relationship
+        all_results.append({
+            "doc_id": source_text_id,
+            "title": source_text_hash,
+            "text": source_text,
+            "src": src_text,
+            "dst": dst_text,
+            "type": relationship_type
+        })
+
+    return pd.DataFrame(all_results)
+
+DATASET_SPLIT = [0.8, 0.1, 0.1]
+def save_results(combined_df, save_dir):
+    # Create the output directory if it doesn't exist
+    os.makedirs(save_dir, exist_ok=True)
+
+    # Calculate the split sizes; the remainder goes to the validation split
+    total_rows = len(combined_df)
+    train_size = int(DATASET_SPLIT[0] * total_rows)
+    test_size = int(DATASET_SPLIT[1] * total_rows)
+    valid_size = total_rows - train_size - test_size
+
+    # Split the dataframe
+    train_df = combined_df[:train_size]
+    test_df = combined_df[train_size:train_size + test_size]
+    valid_df = combined_df[total_rows - valid_size:]
+
+    # Save a dataframe as tab-separated CSV chunks of roughly 1000 rows each
+    def save_to_csv(data, prefix):
+        file_names = []
+        for i, chunk in enumerate(np.array_split(data, max(1, len(data) // 1000))):
+            file_name = f"{prefix}_{i}.csv"
+            chunk.to_csv(os.path.join(save_dir, file_name), index=False, sep='\t')
+            file_names.append(file_name)
+        return file_names
+
+    # Save each split to CSV files
+    train_files = save_to_csv(train_df, 'train')
+    test_files = save_to_csv(test_df, 'test')
+    valid_files = save_to_csv(valid_df, 'valid')
+
+    # Create text files listing the CSV files for each split
+    for split_name, file_list in [('train', train_files), ('test', test_files), ('valid', valid_files)]:
+        with open(os.path.join(save_dir, f"{split_name}.txt"), 'w') as f:
+            f.write('\n'.join(file_list))
+
+    print(f"Files saved in {save_dir} directory.")

 def main():
     # Load the schema
@@ -77,10 +167,14 @@ def main():
     # Get all of the user runs
     save_dir = "extracted_feedback"
     all_user_runs = get_all_user_runs(connection_details)
+    dfs_to_combine = []
     for run_id, source_text_id in all_user_runs:
-        get_user_run_relationships(connection_details, save_dir, run_id, source_text_id)
-        break
+        feedback_df = get_user_run_relationships(connection_details, save_dir, run_id, source_text_id)
+        dfs_to_combine.append(feedback_df)
+    combined_df = pd.concat(dfs_to_combine)

+    # Save the result in the proper format
+    save_results(combined_df, args.save_dir)
     connection_details["session"].close()

 if __name__ == "__main__":
```
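The splits follow `DATASET_SPLIT = [0.8, 0.1, 0.1]`: for example, with 95 combined rows the script writes 76 train rows, 9 test rows, and the remaining 10 rows to the validation split, each as chunked tab-separated `.csv` files plus a `train.txt`/`test.txt`/`valid.txt` manifest listing them. A minimal sketch of reading a split back; the `feedback_training_dataset` directory name is an assumption based on the new `.gitignore` entry:

```python
import os
import pandas as pd

# Assumed output directory; matches the path ignored in .gitignore.
save_dir = "feedback_training_dataset"

# Each <split>.txt manifest lists the chunked TSV files for that split.
with open(os.path.join(save_dir, "train.txt")) as f:
    train_files = f.read().splitlines()

# Load the chunks back into a single dataframe and peek at the relationships.
train_df = pd.concat(
    pd.read_csv(os.path.join(save_dir, name), sep="\t") for name in train_files
)
print(train_df[["src", "type", "dst"]].head())
```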
