forked from GenomicsStandardsConsortium/mixs
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextension_distances.py
More file actions
60 lines (45 loc) · 1.89 KB
/
extension_distances.py
File metadata and controls
60 lines (45 loc) · 1.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import click
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
from linkml_runtime import SchemaView
def _collect_extension_slots(schema_view):
    """Return one ``{"extension", "slot"}`` record per slot of each extension.

    Walks every descendant of the ``Extension`` class, skips any that is also
    a descendant of ``Checklist``, and lists the attribute names of the
    induced (fully resolved) class.

    Args:
        schema_view: A ``SchemaView`` over the schema being analysed.

    Returns:
        A list of dicts with the keys ``"extension"`` and ``"slot"``. Classes
        without any attributes contribute no records.
    """
    # A set makes the per-class membership test O(1) instead of a list scan.
    checklist_class_names = set(schema_view.class_descendants('Checklist'))
    # NOTE(review): class_descendants appears to be reflexive by default in
    # linkml-runtime, so 'Extension' itself may be among the rows -- confirm
    # whether the root class is meant to be plotted.
    records = []
    for extension_name in schema_view.class_descendants('Extension'):
        if extension_name in checklist_class_names:
            continue
        induced = schema_view.induced_class(extension_name)
        records.extend(
            {"extension": extension_name, "slot": slot_name}
            for slot_name in induced.attributes
        )
    return records


@click.command()
@click.option('--schema', '-s',
              default='src/mixs/schema/mixs.yaml',
              required=True,
              help='Path to the schema file')
@click.option('--output', '-o', default='dendrogram.pdf',
              help='Output file name for the dendrogram plot (default: dendrogram.pdf)')
def generate_dendrogram(schema, output):
    """Cluster schema extensions by the slots they use and plot a dendrogram.

    Every non-checklist descendant of the Extension class becomes a boolean
    slot-presence vector. The vectors are compared with Euclidean distance,
    clustered with complete linkage, and the resulting tree is saved to
    OUTPUT as a PDF and then shown on screen.

    Fails with a usage error when the schema yields fewer than two
    extensions with slots, because no clustering is possible in that case.
    """
    schema_view = SchemaView(schema)
    records = _collect_extension_slots(schema_view)

    # With zero rows the pivot below raises KeyError, and with a single row
    # scipy's linkage rejects the empty distance matrix; fail early with a
    # message that names the actual problem instead.
    extension_count = len({record["extension"] for record in records})
    if extension_count < 2:
        raise click.ClickException(
            f"Need at least two non-checklist Extension classes with slots "
            f"to build a dendrogram; found {extension_count} in {schema}."
        )

    df = pd.DataFrame(records)
    # Rows are extensions, columns are slots; a cell is True when the
    # extension uses that slot (pivot leaves NaN for absent pairs).
    pivot_df = df.pivot(index='extension', columns='slot', values='slot').notna()

    # linkage consumes the condensed distance vector directly, so the square
    # form is never needed.
    dist_matrix = pdist(pivot_df.values, metric='euclidean')
    linkage_matrix = hierarchy.linkage(dist_matrix, method='complete')

    plt.figure(figsize=(14, 8))
    hierarchy.dendrogram(linkage_matrix, labels=pivot_df.index.values, orientation='top')
    plt.title('Similarity of MIxS Extensions by Term Usage')
    plt.ylabel('Distance')
    plt.xlabel('Extensions')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    # Save before show(): interactive backends may clear the figure once the
    # window is closed.
    plt.savefig(output, format='pdf')
    plt.show()
# Run the command-line interface only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == '__main__':
    generate_dendrogram()