Update notebook paths: read raw inputs from RAW_DIR and derived group/model files from MODELS_DIR

SltMatteo · SltMatteo · commit 65b74c2cd47d · 2025-12-21T21:03:38.000+01:00
diff --git a/results_user_community.ipynb b/results_user_community.ipynb
@@ -107,6 +107,8 @@
     "BASE_DIR = Path(\".\").resolve()\n",
     "ANALYSIS_DIR = BASE_DIR / \"analysis_out\"\n",
     "DATA_DIR = BASE_DIR / \"data\"\n",
+    "RAW_DIR = DATA_DIR / \"raw\"  # raw inputs: comments, helper metadata and channels files\n",
+    "MODELS_DIR = DATA_DIR / \"models\"  # derived group, feature and k-means files\n",
     "\n",
     "# Derived outputs (same directory you used before)\n",
     "DERIVED_DIR = BASE_DIR / \"derived\"\n",
@@ -256,9 +258,9 @@
    ],
    "source": [
     "# Raw inputs \n",
-    "comments_path = (BASE_DIR / \"../../big/youtube_comments.tsv.gz\").resolve()\n",
-    "meta_path     = (BASE_DIR / \"../../helper/yt_metadata_helper.feather\").resolve()\n",
-    "channels_path = (BASE_DIR / \"../../processed/df_channels_en.tsv.gz\").resolve()\n",
+    "comments_path = (RAW_DIR / \"youtube_comments.tsv.gz\").resolve()\n",
+    "meta_path     = (RAW_DIR / \"yt_metadata_helper.feather\").resolve()\n",
+    "channels_path = (RAW_DIR / \"df_channels_en.tsv.gz\").resolve()\n",
     "\n",
     "# Optional: quick sanity checks / peek at helper metadata (lightweight)\n",
     "if meta_path.exists():\n",
@@ -2310,8 +2312,8 @@
     "# ### 2.2.1 Building the groups\n",
     "\n",
     "author_channel_path = author_channel_out\n",
-    "author_groups_path = \"../Dataset/custom_3/author_groups.tsv.gz\"\n",
-    "groups_channelid_numc_path = \"../Dataset/custom_3/groups_channelid_numc.tsv.gz\"\n",
+    "author_groups_path = (MODELS_DIR / \"author_groups.tsv.gz\").resolve()\n",
+    "groups_channelid_numc_path = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()\n",
     "\n",
     "build_author_groups_and_group_channels(\n",
     "    path=author_channel_path,\n",
@@ -2349,12 +2351,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "out_path = (MODELS_DIR / \"groups_num_authors.tsv.gz\").resolve()\n",
     "compute_groups_num_authors(\n",
-    "    author_groups_path=\"../Dataset/custom_3/author_groups.tsv.gz\",\n",
-    "    out_path=\"../Dataset/custom_3/groups_num_authors.tsv.gz\",\n",
+    "    author_groups_path=author_groups_path,\n",
+    "    out_path=out_path,\n",
     ")\n",
     "\n",
-    "groups_nauthor = pd.read_csv(\"../Dataset/custom_3/groups_num_authors.tsv.gz\", sep=\"\\t\", compression=\"infer\")\n",
+    "groups_nauthor = pd.read_csv(out_path, sep=\"\\t\", compression=\"infer\")\n",
     "groups_nauthor.head()\n"
    ]
   },
@@ -2377,15 +2380,15 @@
     "import csv\n",
     "from tqdm.notebook import tqdm\n",
     "\n",
-    "input_path  = \"../Dataset/custom_3/groups_channelid_numc.tsv.gz\"\n",
-    "output_path = \"../Dataset/custom_3/groups_num_channels.tsv.gz\"\n",
+    "input_path  = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()\n",
+    "output_path = (MODELS_DIR / \"groups_num_channels.tsv.gz\").resolve()\n",
     "\n",
     "total_lines = 3_509_000_000\n",
     "\n",
     "\n",
     "compute_groups_num_channels(\n",
-    "    groups_channelid_numc_path=\"../Dataset/custom_3/groups_channelid_numc.tsv.gz\",\n",
-    "    out_path=\"../Dataset/custom_3/groups_num_channels.tsv.gz\",\n",
+    "    groups_channelid_numc_path=input_path,\n",
+    "    out_path=output_path,\n",
     "    total_lines=3_509_000_000,  # or None if you don't care about tqdm\n",
     ")\n",
     "\n"
@@ -2418,12 +2421,12 @@
     "import csv\n",
     "from tqdm import tqdm\n",
     "\n",
-    "input_path  = \"../Dataset/custom_3/groups_channelid_numc.tsv.gz\"\n",
-    "output_path = \"../Dataset/custom_3/groups_total_comments.tsv.gz\"\n",
+    "input_path  = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()\n",
+    "output_path = (MODELS_DIR / \"groups_total_comments.tsv.gz\").resolve()\n",
     "\n",
     "compute_groups_total_comments(\n",
-    "    groups_channelid_numc_path=\"../Dataset/custom_3/groups_channelid_numc.tsv.gz\",\n",
-    "    out_path=\"../Dataset/custom_3/groups_total_comments.tsv.gz\",\n",
+    "    groups_channelid_numc_path=input_path,\n",
+    "    out_path=output_path,\n",
     "    total_lines=3_509_000_000,\n",
     ")\n"
    ]
@@ -2462,7 +2465,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "d6745f51af2dc3bc",
    "metadata": {
     "ExecuteTime": {
@@ -2486,7 +2489,9 @@
     "import pandas as pd\n",
     "import matplotlib.pyplot as plt\n",
     "\n",
-    "groups_nauthor = pd.read_csv(\"../Dataset/custom_3/groups_num_authors.tsv.gz\", sep=\"\\t\", compression=\"infer\")\n",
+    "groups_nauthor_path = (MODELS_DIR / \"groups_num_authors.tsv.gz\").resolve()\n",
+    "\n",
+    "groups_nauthor = pd.read_csv(groups_nauthor_path, sep=\"\\t\", compression=\"infer\")\n",
     "\n",
     "plt.figure(figsize=(8, 5))\n",
     "plt.hist(groups_nauthor[\"num_authors\"], log=True, bins=50)\n",
@@ -2498,7 +2503,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "f7081af991c846c8",
    "metadata": {
     "ExecuteTime": {
@@ -2522,8 +2527,10 @@
     "import pandas as pd\n",
     "import matplotlib.pyplot as plt\n",
     "\n",
+    "groups_total_comments_path = (MODELS_DIR / \"groups_total_comments.tsv.gz\").resolve()\n",
+    "\n",
     "total_c = pd.read_csv(\n",
-    "    \"../Dataset/custom_3/groups_total_comments.tsv.gz\",\n",
+    "    groups_total_comments_path,\n",
     "    sep=\"\\t\",\n",
     "    compression=\"infer\"\n",
     ")\n",
@@ -2538,7 +2545,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "f0b90b12bc375726",
    "metadata": {
     "ExecuteTime": {
@@ -2562,8 +2569,9 @@
     "import pandas as pd\n",
     "import matplotlib.pyplot as plt\n",
     "\n",
+    "groups_num_channels_path = (MODELS_DIR / \"groups_num_channels.tsv.gz\").resolve()\n",
     "n_channels = pd.read_csv(\n",
-    "    \"../Dataset/custom_3/groups_num_channels.tsv.gz\",\n",
+    "    groups_num_channels_path,\n",
     "    sep=\"\\t\",\n",
     "    compression=\"infer\"\n",
     ")\n",
@@ -2601,10 +2609,14 @@
    "source": [
     "OUT_DIR = \"web/public/data\"\n",
     "\n",
+    "gna_path = (MODELS_DIR / \"groups_num_authors.tsv.gz\").resolve()\n",
+    "gnc_path = (MODELS_DIR / \"groups_num_channels.tsv.gz\").resolve()\n",
+    "gtc_path = (MODELS_DIR / \"groups_total_comments.tsv.gz\").resolve()\n",
+    "\n",
     "export_group_histograms(\n",
-    "    groups_num_authors_path=\"../Dataset/custom_3/groups_num_authors.tsv.gz\",\n",
-    "    groups_total_comments_path=\"../Dataset/custom_3/groups_total_comments.tsv.gz\",\n",
-    "    groups_num_channels_path=\"../Dataset/custom_3/groups_num_channels.tsv.gz\",\n",
+    "    groups_num_authors_path=gna_path,\n",
+    "    groups_total_comments_path=gtc_path,\n",
+    "    groups_num_channels_path=gnc_path,\n",
     "    out_dir=OUT_DIR,\n",
     ")\n"
    ]
@@ -2667,11 +2679,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "groups_path = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()  # groups input file\n",
+    "channels_path = (RAW_DIR / \"df_channels_en.tsv.gz\").resolve()  # NOTE(review): this call previously read df_channels.tsv - confirm the _en file is intended\n",
+    "out_path = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()  # passed as out_path to build_group_features\n",
+    "\n",
     "build_group_features(\n",
-    "    groups_path=\"../Dataset/custom_3/groups_channelid_numc.tsv.gz\",\n",
-    "    channels_path=\"../Dataset/original/df_channels.tsv\",\n",
-    "    out_path=\"../Dataset/custom_3/groups_features.tsv.gz\",\n",
-    "    total_rows=3_239_637_783,  # whatever you used as total_rows\n",
+    "    groups_path=groups_path,\n",
+    "    channels_path=channels_path,\n",
+    "    out_path=out_path,\n",
+    "    total_rows=3_239_637_783,  # value carried over from earlier runs; NOTE(review): other cells pass total_lines=3_509_000_000 for this same groups_channelid_numc file - confirm which count is right\n",
     ")\n"
    ]
   },
@@ -2685,7 +2701,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "f4d29261dbf88949",
    "metadata": {
     "ExecuteTime": {
@@ -2729,7 +2745,9 @@
     "import pandas as pd\n",
     "import matplotlib.pyplot as plt\n",
     "\n",
-    "feat = pd.read_csv(\"../Dataset/custom_3/groups_features.tsv.gz\", sep=\"\\t\", compression=\"infer\")\n",
+    "fp = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n",
+    "\n",
+    "feat = pd.read_csv(fp, sep=\"\\t\", compression=\"infer\")\n",
     "\n",
     "feat = feat[(feat[\"total_comments\"] > 10) & (feat[\"num_channels\"] < 1000)]\n",
     "\n",
@@ -2777,8 +2795,10 @@
    "source": [
     "OUT_DIR = \"web/public/data\"\n",
     "\n",
+    "fp = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n",
+    "\n",
     "export_feature_distributions(\n",
-    "    features_path=\"../Dataset/custom_3/groups_features.tsv.gz\",\n",
+    "    features_path=fp,\n",
     "    out_dir=OUT_DIR,\n",
     "    min_tc=10,\n",
     "    max_tc=1000,\n",
@@ -2800,7 +2820,9 @@
     "OUT_DIR = \"web/public/data\"\n",
     "os.makedirs(OUT_DIR, exist_ok=True)\n",
     "\n",
-    "feat = pd.read_csv(\"../Dataset/custom_3/groups_features.tsv.gz\", sep=\"\\t\", compression=\"infer\")\n",
+    "fp = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n",
+    "\n",
+    "feat = pd.read_csv(fp, sep=\"\\t\", compression=\"infer\")\n",
     "for c in [\"total_comments\",\"num_channels\",\"fidelity\",\"category_entropy\"]:\n",
     "    feat[c] = pd.to_numeric(feat[c], errors=\"coerce\")\n",
     "\n",
@@ -2866,8 +2888,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "features_path = \"../Dataset/custom_3/groups_features.tsv.gz\"\n",
-    "clusters_path = \"../Dataset/custom_3/groups_kmeans_10.tsv.gz\"\n",
+    "features_path = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n",
+    "clusters_path = (MODELS_DIR / \"groups_kmeans_10.tsv.gz\").resolve()\n",
     "\n",
     "run_kmeans_on_groups(\n",
     "    features_path=features_path,\n",
@@ -2885,10 +2907,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "authors_path = (MODELS_DIR / \"groups_num_authors.tsv.gz\").resolve()\n",
+    "clusters_path = (MODELS_DIR / \"groups_kmeans_10.tsv.gz\").resolve()\n",
+    "out_path = (MODELS_DIR / \"kmeans_cluster_people_share.tsv\").resolve()\n",
+    "\n",
     "summarize_kmeans_cluster_people_share(\n",
     "    clusters_path=clusters_path,\n",
-    "    authors_path=\"../Dataset/custom_3/groups_num_authors.tsv.gz\",\n",
-    "    out_path=\"../Dataset/custom_3/kmeans_cluster_people_share.tsv\",\n",
+    "    authors_path=authors_path,\n",
+    "    out_path=out_path,\n",
     ")\n"
    ]
   },
@@ -2907,9 +2933,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "features_path = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n",
+    "clusters_path = (MODELS_DIR / \"groups_kmeans_15_1000.tsv.gz\").resolve()\n",
     "plot_kmeans_clusters(\n",
-    "    features_path=\"../Dataset/custom_3/groups_features.tsv.gz\",\n",
-    "    clusters_path=\"../Dataset/custom_3/groups_kmeans_15_1000.tsv.gz\",\n",
+    "    features_path=features_path,\n",
+    "    clusters_path=clusters_path,\n",
     "    min_tc=10,\n",
     "    max_tc=1000,\n",
     "    max_points=300_000,\n",
@@ -2936,8 +2964,9 @@
    },
    "outputs": [],
    "source": [
+    "features_path = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()  # defined here so this cell does not depend on a variable left over from an earlier cell\n",
     "export_kmeans_explorer_assets(\n",
-    "    features_path=\"../Dataset/custom_3/groups_features.tsv.gz\",\n",
+    "    features_path=features_path,\n",
     "    out_dir=\"web/public/data/kmeans_explorer\",\n",
     "    min_tc=10,\n",
     "    max_tc=1000,\n",
@@ -3007,10 +3036,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "gp = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()  # groups input, passed as groups_path\n",
+    "ch = (RAW_DIR / \"df_channels_en.tsv.gz\").resolve()  # channels input; NOTE(review): this call previously read df_channels.tsv - confirm the _en file is intended\n",
+    "out = (MODELS_DIR / \"groups_category_numc.tsv.gz\").resolve()  # passed as out_path to category_analysis_for_groups\n",
     "category_analysis_for_groups(\n",
-    "    groups_path=\"../Dataset/custom_3/groups_channelid_numc.tsv.gz\",\n",
-    "    channels_path=\"../Dataset/original/df_channels.tsv\",\n",
-    "    out_path=\"../Dataset/custom_3/groups_category_numc.tsv.gz\",\n",
+    "    groups_path=gp,\n",
+    "    channels_path=ch,\n",
+    "    out_path=out,\n",
     "    total_lines=3_509_000_000,\n",
     ")\n"
    ]