-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyze_godot_notes_project.py
More file actions
122 lines (105 loc) · 4.53 KB
/
analyze_godot_notes_project.py
File metadata and controls
122 lines (105 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
"""Lexical analysis of the 'Godot' vs 'Notes' corpus.

Reads the corpus and theme-lexicon CSVs from ../data, computes simple
per-document metrics (token counts, type/token ratio, naive sentiment,
theme hits), and writes bar charts, a co-occurrence network, and a
summary CSV into ../output.
"""
import os
import re
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

# Resolve all paths relative to this script so it works from any CWD.
HERE = os.path.dirname(__file__)
CORPUS = os.path.join(HERE, '..', 'data', 'godot_notes_corpus.csv')
LEX = os.path.join(HERE, '..', 'data', 'theme_lexicon.csv')
OUT = os.path.join(HERE, '..', 'output')
os.makedirs(OUT, exist_ok=True)

df = pd.read_csv(CORPUS)  # per-document rows; code below uses 'id', 'work', 'year', 'voice', 'text'
lex = pd.read_csv(LEX)    # keyword -> theme lookup table ('keyword', 'theme' columns)
def tokenize(t):
    """Lowercase *t*, replace non-alphanumerics with spaces, return word list."""
    cleaned = re.sub(r"[^a-z0-9\s]", " ", t.lower())
    # str.split() with no argument already drops empty fields.
    return cleaned.split()
# Per-document token list, length, and lexical diversity (unique / total).
df["tokens"] = df["text"].map(tokenize)
df["token_count"] = df["tokens"].map(len)
df["type_token_ratio"] = df["tokens"].map(lambda ws: len(set(ws)) / max(1, len(ws)))
# Tiny hand-rolled sentiment lexicon (demo only, not a real model).
POS = {"jokes", "pleasure", "together", "believe"}
NEG = {"ache", "irritation", "corrodes"}


def sent(toks):
    """Score a token list as (positive hits - negative hits) / token count."""
    pos_hits = sum(w in POS for w in toks)
    neg_hits = sum(w in NEG for w in toks)
    # max(1, ...) guards the empty-document case against division by zero.
    return (pos_hits - neg_hits) / max(1, len(toks))
df["sentiment"] = df["tokens"].map(sent)
# First word of the title: "Godot" or "Notes".
df["work_short"] = df["work"].str.split().str[0]
# --- Bar chart: number of documents per work
work_counts = df["work_short"].value_counts().sort_index()
ax = work_counts.plot(kind="bar", title="Documents per Work")
ax.set_xlabel("Work")
ax.set_ylabel("Count")
fig = ax.get_figure()
fig.tight_layout()
fig.savefig(os.path.join(OUT, "docs_per_work.png"))
plt.close(fig)
# --- One bar chart per metric: per-work means
for metric in ("token_count", "type_token_ratio", "sentiment"):
    means = df.groupby("work_short")[metric].mean()
    ax = means.plot(kind="bar", title=f"Average {metric} by Work")
    ax.set_xlabel("Work")
    ax.set_ylabel(metric)
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(os.path.join(OUT, f"avg_{metric}_by_work.png"))
    plt.close(fig)
# --- Theme hits (outward vs inward) by work
kw_to_theme = {kw: theme for kw, theme in zip(lex["keyword"], lex["theme"])}
def theme_counts(tokens, mapping=None):
    """Count how many tokens hit each theme.

    Parameters
    ----------
    tokens : iterable of str
        Tokenized document text.
    mapping : dict[str, str] | None
        keyword -> theme lookup; when None (the default, preserving the
        original behavior) the module-level ``kw_to_theme`` built from
        the theme lexicon is used.

    Returns
    -------
    Counter
        theme name -> number of keyword hits in *tokens*.
    """
    lookup = kw_to_theme if mapping is None else mapping
    c = Counter()
    for w in tokens:
        theme = lookup.get(w)
        if theme is not None:
            c[theme] += 1
    return c
df["theme_counts"] = df["tokens"].map(theme_counts)
# Long-form rows: one (work, theme) hit count per document and theme.
rows = [
    {"work_short": rec["work_short"], "theme": theme, "hits": rec["theme_counts"].get(theme, 0)}
    for _, rec in df.iterrows()
    for theme in ("outward", "inward")
]
theme_df = pd.DataFrame(rows)
agg_theme = theme_df.groupby(["work_short", "theme"])["hits"].sum().unstack(fill_value=0)
ax = agg_theme.plot(kind="bar", title="Theme Hits by Work (Outward vs Inward)")
ax.set_xlabel("Work")
ax.set_ylabel("Hits")
fig = ax.get_figure()
fig.tight_layout()
fig.savefig(os.path.join(OUT, "theme_hits_by_work.png"))
plt.close(fig)
# --- Dialogicity: dialog vs monologue
voice_counts = df["voice"].value_counts().sort_index()
ax = voice_counts.plot(kind="bar", title="Dialog vs Monologue (Voice)")
ax.set_xlabel("Voice")
ax.set_ylabel("Count")
fig = ax.get_figure()
fig.tight_layout()
fig.savefig(os.path.join(OUT, "voice_distribution.png"))
plt.close(fig)
# --- Top keywords per work (stopword-filtered)
STOP = {"the","and","a","an","of","to","in","is","it","that","this","as","on","for","by","with","from","into","not","has","have","had","was","were"}


def top_words(texts):
    """Return the 15 most common content words across the token lists in *texts*.

    Words in STOP, words of length <= 2, and pure digit strings are ignored.
    """
    freq = Counter(
        w
        for toks in texts
        for w in toks
        if w not in STOP and len(w) > 2 and not w.isdigit()
    )
    return freq.most_common(15)
# One keyword bar chart per work; skip works with no content words.
for work in df["work_short"].unique():
    common = top_words(df[df["work_short"] == work]["tokens"].tolist())
    if not common:
        continue
    labels, freqs = zip(*common)
    fig = plt.figure()
    plt.bar(range(len(labels)), freqs)
    plt.xticks(range(len(labels)), labels, rotation=45, ha="right")
    plt.title(f"Top Keywords — {work}")
    plt.tight_layout()
    fig.savefig(os.path.join(OUT, f"top_keywords_{work}.png"))
    plt.close(fig)
# --- Co-occurrence network (global; window = whole document)
# Nodes are content words, edge weight = number of documents in which the
# two words appear together.
G = nx.Graph()
for toks in df["tokens"]:
    # Unique content words, sorted so edge insertion order is deterministic.
    uniq = sorted({w for w in toks if w not in STOP and len(w) > 2})
    # Every unordered pair of words in the same document co-occurs once.
    for i, a in enumerate(uniq):
        for b in uniq[i + 1:]:
            if G.has_edge(a, b):
                G[a][b]["weight"] += 1
            else:
                G.add_edge(a, b, weight=1)

pos = nx.spring_layout(G, seed=23)  # fixed seed -> reproducible layout
plt.figure()
nx.draw_networkx_nodes(G, pos)
nx.draw_networkx_edges(G, pos, width=[G[u][v]["weight"] for u, v in G.edges()])
nx.draw_networkx_labels(G, pos, font_size=7)
plt.axis("off")
plt.tight_layout()
plt.savefig(os.path.join(OUT, "cooccurrence_network.png"))
plt.close()
# --- Export per-document metrics for downstream use
summary_cols = ["id", "work", "year", "voice", "token_count", "type_token_ratio", "sentiment"]
summary = df[summary_cols]
summary.to_csv(os.path.join(OUT, "summary_metrics.csv"), index=False)
print("Analysis complete. Outputs written to output/.")