-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyze_godot_notes_project.py
More file actions
122 lines (105 loc) · 4.53 KB
/
analyze_godot_notes_project.py
File metadata and controls
122 lines (105 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
"""Lexical analysis of the 'Godot' vs 'Notes' corpus.

Reads the corpus and theme-lexicon CSVs from ../data, computes simple
per-document metrics (token counts, type/token ratio, naive sentiment,
theme hits), and writes bar charts, a co-occurrence network, and a
summary CSV into ../output.
"""
import os
import re
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

# Resolve all paths relative to this script so it works from any CWD.
HERE = os.path.dirname(__file__)
CORPUS = os.path.join(HERE, '..', 'data', 'godot_notes_corpus.csv')
LEX = os.path.join(HERE, '..', 'data', 'theme_lexicon.csv')
OUT = os.path.join(HERE, '..', 'output')
os.makedirs(OUT, exist_ok=True)

df = pd.read_csv(CORPUS)  # per-document rows; code below uses 'id', 'work', 'year', 'voice', 'text'
lex = pd.read_csv(LEX)    # keyword -> theme lookup table ('keyword', 'theme' columns)
def tokenize(t):
    """Lowercase *t*, replace non-alphanumerics with spaces, return word list."""
    cleaned = re.sub(r"[^a-z0-9\s]", " ", t.lower())
    # str.split() with no argument already drops empty fields.
    return cleaned.split()
# Per-document token list, length, and lexical diversity (unique / total).
df["tokens"] = df["text"].map(tokenize)
df["token_count"] = df["tokens"].map(len)
df["type_token_ratio"] = df["tokens"].map(lambda ws: len(set(ws)) / max(1, len(ws)))
# Tiny hand-rolled sentiment lexicon (demo only, not a real model).
POS = {"jokes", "pleasure", "together", "believe"}
NEG = {"ache", "irritation", "corrodes"}


def sent(toks):
    """Score a token list as (positive hits - negative hits) / token count."""
    pos_hits = sum(w in POS for w in toks)
    neg_hits = sum(w in NEG for w in toks)
    # max(1, ...) guards the empty-document case against division by zero.
    return (pos_hits - neg_hits) / max(1, len(toks))
df["sentiment"] = df["tokens"].map(sent)
# First word of the title: "Godot" or "Notes".
df["work_short"] = df["work"].str.split().str[0]
# --- Bar chart: number of documents per work
work_counts = df["work_short"].value_counts().sort_index()
ax = work_counts.plot(kind="bar", title="Documents per Work")
ax.set_xlabel("Work")
ax.set_ylabel("Count")
fig = ax.get_figure()
fig.tight_layout()
fig.savefig(os.path.join(OUT, "docs_per_work.png"))
plt.close(fig)
# --- One bar chart per metric: per-work means
for metric in ("token_count", "type_token_ratio", "sentiment"):
    means = df.groupby("work_short")[metric].mean()
    ax = means.plot(kind="bar", title=f"Average {metric} by Work")
    ax.set_xlabel("Work")
    ax.set_ylabel(metric)
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(os.path.join(OUT, f"avg_{metric}_by_work.png"))
    plt.close(fig)
# --- Theme hits (outward vs inward) by work
kw_to_theme = {kw: theme for kw, theme in zip(lex["keyword"], lex["theme"])}
def theme_counts(tokens, mapping=None):
    """Count how many tokens hit each theme.

    Parameters
    ----------
    tokens : iterable of str
        Tokenized document text.
    mapping : dict[str, str] | None
        keyword -> theme lookup; when None (the default, preserving the
        original behavior) the module-level ``kw_to_theme`` built from
        the theme lexicon is used.

    Returns
    -------
    Counter
        theme name -> number of keyword hits in *tokens*.
    """
    lookup = kw_to_theme if mapping is None else mapping
    c = Counter()
    for w in tokens:
        theme = lookup.get(w)
        if theme is not None:
            c[theme] += 1
    return c
df["theme_counts"] = df["tokens"].map(theme_counts)
# Long-form rows: one (work, theme) hit count per document and theme.
rows = [
    {"work_short": rec["work_short"], "theme": theme, "hits": rec["theme_counts"].get(theme, 0)}
    for _, rec in df.iterrows()
    for theme in ("outward", "inward")
]
theme_df = pd.DataFrame(rows)
agg_theme = theme_df.groupby(["work_short", "theme"])["hits"].sum().unstack(fill_value=0)
ax = agg_theme.plot(kind="bar", title="Theme Hits by Work (Outward vs Inward)")
ax.set_xlabel("Work")
ax.set_ylabel("Hits")
fig = ax.get_figure()
fig.tight_layout()
fig.savefig(os.path.join(OUT, "theme_hits_by_work.png"))
plt.close(fig)
# --- Dialogicity: dialog vs monologue
voice_counts = df["voice"].value_counts().sort_index()
ax = voice_counts.plot(kind="bar", title="Dialog vs Monologue (Voice)")
ax.set_xlabel("Voice")
ax.set_ylabel("Count")
fig = ax.get_figure()
fig.tight_layout()
fig.savefig(os.path.join(OUT, "voice_distribution.png"))
plt.close(fig)
# --- Top keywords per work (stopword-filtered)
STOP = {"the","and","a","an","of","to","in","is","it","that","this","as","on","for","by","with","from","into","not","has","have","had","was","were"}


def top_words(texts):
    """Return the 15 most common content words across the token lists in *texts*.

    Words in STOP, words of length <= 2, and pure digit strings are ignored.
    """
    freq = Counter(
        w
        for toks in texts
        for w in toks
        if w not in STOP and len(w) > 2 and not w.isdigit()
    )
    return freq.most_common(15)
# One keyword bar chart per work; skip works with no content words.
for work in df["work_short"].unique():
    common = top_words(df[df["work_short"] == work]["tokens"].tolist())
    if not common:
        continue
    labels, freqs = zip(*common)
    fig = plt.figure()
    plt.bar(range(len(labels)), freqs)
    plt.xticks(range(len(labels)), labels, rotation=45, ha="right")
    plt.title(f"Top Keywords — {work}")
    plt.tight_layout()
    fig.savefig(os.path.join(OUT, f"top_keywords_{work}.png"))
    plt.close(fig)
# --- Co-occurrence network (global; window = whole document)
# Nodes are content words, edge weight = number of documents in which the
# two words appear together.
G = nx.Graph()
for toks in df["tokens"]:
    # Unique content words, sorted so edge insertion order is deterministic.
    uniq = sorted({w for w in toks if w not in STOP and len(w) > 2})
    # Every unordered pair of words in the same document co-occurs once.
    for i, a in enumerate(uniq):
        for b in uniq[i + 1:]:
            if G.has_edge(a, b):
                G[a][b]["weight"] += 1
            else:
                G.add_edge(a, b, weight=1)

pos = nx.spring_layout(G, seed=23)  # fixed seed -> reproducible layout
plt.figure()
nx.draw_networkx_nodes(G, pos)
nx.draw_networkx_edges(G, pos, width=[G[u][v]["weight"] for u, v in G.edges()])
nx.draw_networkx_labels(G, pos, font_size=7)
plt.axis("off")
plt.tight_layout()
plt.savefig(os.path.join(OUT, "cooccurrence_network.png"))
plt.close()
# --- Export per-document metrics for downstream use
summary_cols = ["id", "work", "year", "voice", "token_count", "type_token_ratio", "sentiment"]
summary = df[summary_cols]
summary.to_csv(os.path.join(OUT, "summary_metrics.csv"), index=False)
print("Analysis complete. Outputs written to output/.")