# Taxonomy-Analysis/edtechtaxonomyanalysis.py at main · Minaekramnia/Taxonomy-Analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# -*- coding: utf-8 -*-
"""EdTechTaxonomyAnalysis.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1YZFK2xsPUaD7sPtWgElGVvh-dIxRfhFf
"""

# libraries
import pandas as pd
from openai import OpenAI
import openai
import os

# === CONFIG ===
# Workbook the script reads, followed by the two workbooks it writes.
input_file = "Final_Taxonomy_Analysis_Model_Reasoning_All_Projects.xlsx"
output_summary_file = "Taxonomy_Activity_Recommendations_Summary.xlsx"
output_definitions_file = "Taxonomy_Updated_Definitions_Refined.xlsx"
# Read the key from the conventional OPENAI_API_KEY environment variable.
# os.getenv("") looks up a variable with an empty name, so it always yields
# None; the result is still None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

# === LOAD DATA ===
df = pd.read_excel(io=input_file)

# === SUMMARY STATISTICS PER ACTIVITY ===
# Named aggregations in the form {output column: (source column, function)};
# the result is indexed by "Activity Name".
summary = df.groupby(by="Activity Name").agg(**{
    "Count": ("Project", "count"),
    "Removed": ("Removed in Updated", "sum"),
    "Added": ("Added in Updated", "sum"),
    "Human_Tagged": ("Selected by Human", "sum"),
    "Updated_Selected": ("In Updated Taxonomy", "sum"),
    "Old_Selected": ("In Old Taxonomy", "sum"),
})

# === STRATEGIC RECOMMENDATIONS ===
def get_recommendation(row):
    """Classify one per-activity summary row into a recommendation label.

    Args:
        row: Mapping-like object (e.g. a pandas Series) exposing the keys
            "Human_Tagged", "Removed", "Added", "Updated_Selected" and
            "Old_Selected".

    Returns:
        One of the five recommendation strings. Rules are checked in order
        and the first match wins.
    """
    human_count = row["Human_Tagged"]
    often_tagged = human_count > 3
    never_tagged = human_count == 0

    # Frequently tagged by humans, yet removed at least once.
    if often_tagged and row["Removed"] > 0:
        return "Improve & Reintroduce (important but dropped)"

    # Frequently tagged and selected at least as often as under the old taxonomy.
    if often_tagged and row["Updated_Selected"] >= row["Old_Selected"]:
        return "Keep (high alignment with human and model)"

    # Added by the update but never tagged by a human.
    if row["Added"] > 0 and never_tagged:
        return "Review or Refine (added but not validated)"

    # Never tagged by a human, yet selected under the updated taxonomy.
    if never_tagged and row["Updated_Selected"] > 0:
        return "Over-inclusive (needs qualifiers)"

    return "Low Priority or Maintain"

# Label each activity row with its recommendation, then write the table to
# the summary workbook (index included, as to_excel does by default).
summary["General Recommendation"] = summary.apply(func=get_recommendation, axis="columns")
summary.to_excel(excel_writer=output_summary_file)

# === DEFINITION REFINEMENT ===
# Keep only the definition-related columns, then reduce to one row per
# activity (the first occurrence of each "Activity Name" is retained).
definition_df = df.loc[:, [
    "Activity Name",
    "Definition (Old Taxonomy)",
    "Definition (Updated Taxonomy)",
    "Model-Based Analysis",
    "Suggested Improvement",
    "Project",
]].drop_duplicates("Activity Name")

# Attach each activity's recommendation; `summary` is indexed by activity name.
definition_df = pd.merge(
    definition_df,
    summary[["General Recommendation"]],
    left_on="Activity Name",
    right_index=True,
)

def refine_definition(row):
    """Build the proposed definition text for one activity row.

    Args:
        row: Mapping-like object (e.g. a pandas Series) with the keys
            "Definition (Updated Taxonomy)" and "General Recommendation".

    Returns:
        The updated definition, converted to ``str`` and stripped of
        surrounding whitespace. For three of the recommendation labels a
        parenthesised guidance note is appended after a single space; for
        every other label the stripped definition is returned as is.
    """
    # Guidance note appended for each recommendation that asks for a change.
    guidance_by_recommendation = {
        "Improve & Reintroduce (important but dropped)":
            "(Consider restoring this activity due to consistent human tagging. Add qualifiers such as 'in low-resource settings' or 'linked to measurable outcomes.')",
        "Review or Refine (added but not validated)":
            "(Revise by including project-relevant examples or reframe to clarify scope. Ensure it matches PAD language.)",
        "Over-inclusive (needs qualifiers)":
            "(Too broad. Add limits such as 'only when linked to formal systems' or 'excluding generic statements of support.')",
    }

    base_text = str(row["Definition (Updated Taxonomy)"]).strip()
    guidance = guidance_by_recommendation.get(row["General Recommendation"])
    if guidance is None:
        # "Keep", "Low Priority or Maintain" and any unknown label: no note.
        return base_text
    return f"{base_text} {guidance}"

# Compute the proposed wording for every activity and write the table to the
# definitions workbook without the index column.
definition_df["Proposed Updated Definition"] = definition_df.apply(func=refine_definition, axis="columns")
definition_df.to_excel(excel_writer=output_definitions_file, index=False)

# === OPTIONAL: GPT-4o MINI MODEL INTEGRATION ===
def format_prompt(row):
    """Render the evaluation prompt text for a single activity row.

    Args:
        row: Mapping-like object (e.g. a pandas Series) with the keys
            "Activity Name", "Definition (Old Taxonomy)",
            "Definition (Updated Taxonomy)", "Selected by Human",
            "Selected by Model", "Selected by Both Model and Human",
            "Added in Updated" and "Removed in Updated".

    Returns:
        A multi-line string that begins and ends with a newline. The three
        "Selected by" values are shown as "Yes"/"No" based on truthiness;
        all other values are interpolated as they are.
    """
    def yes_no(flag):
        # Any truthy value counts as selected.
        return 'Yes' if flag else 'No'

    # The empty first and last entries give the leading and trailing newline.
    prompt_lines = [
        "",
        "You are evaluating how well an updated taxonomy definition aligns with real project descriptions and human tagging practices.",
        "",
        f"Activity Name: {row['Activity Name']}",
        "",
        "Old Definition:",
        f"{row['Definition (Old Taxonomy)']}",
        "",
        "Updated Definition:",
        f"{row['Definition (Updated Taxonomy)']}",
        "",
        f"Selected by Human: {yes_no(row['Selected by Human'])}",
        f"Selected by Model: {yes_no(row['Selected by Model'])}",
        f"Selected by Both: {yes_no(row['Selected by Both Model and Human'])}",
        "",
        "Was this activity added or removed in the updated taxonomy?",
        f"- Added: {row['Added in Updated']}",
        f"- Removed: {row['Removed in Updated']}",
        "",
        "Please analyze:",
        "1. Whether the updated definition improved alignment with human tagging.",
        "2. If not selected by humans, whether it seems too broad or unclear.",
        "3. Suggest ways to improve the updated definition, if needed.",
        "",
    ]
    return "\n".join(prompt_lines)

# Generate project-specific filtered files like Africa_Taxonomy_Relevant_Only...
def _safe_filename_part(value):
    """Return str(value) with characters that are unsafe in file names replaced by "_"."""
    return "".join("_" if ch in '\\/:*?"<>|' else ch for ch in str(value))


# Skip rows without a project: NaN never compares equal to itself, so the
# filter below would select nothing and write an empty "nan_..." workbook.
projects = df['Project'].dropna().unique()
for project in projects:
    project_df = df[df['Project'] == project].copy()
    # Sanitize the name so a "/" (or similar) in a project title cannot point
    # the output at another directory or make the file name invalid.
    filename = f"{_safe_filename_part(project)}_Taxonomy_Relevant_Only_With_Definitions_And_Both_Tagging_Sources.xlsx"
    project_df.to_excel(filename, index=False)

print("✅ All files generated, including project-specific analysis sheets.")