Skip to content

Commit 65b74c2

Browse files
committed
update paths
1 parent e493b3c commit 65b74c2

1 file changed

Lines changed: 75 additions & 43 deletions

File tree

results_user_community.ipynb

Lines changed: 75 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,8 @@
107107
"BASE_DIR = Path(\".\").resolve()\n",
108108
"ANALYSIS_DIR = BASE_DIR / \"analysis_out\"\n",
109109
"DATA_DIR = BASE_DIR / \"data\"\n",
110+
"RAW_DIR = DATA_DIR / \"raw\"\n",
111+
"MODELS_DIR = DATA_DIR / \"models\"\n",
110112
"\n",
111113
"# Derived outputs (same directory you used before)\n",
112114
"DERIVED_DIR = BASE_DIR / \"derived\"\n",
@@ -256,9 +258,9 @@
256258
],
257259
"source": [
258260
"# Raw inputs \n",
259-
"comments_path = (BASE_DIR / \"../../big/youtube_comments.tsv.gz\").resolve()\n",
260-
"meta_path = (BASE_DIR / \"../../helper/yt_metadata_helper.feather\").resolve()\n",
261-
"channels_path = (BASE_DIR / \"../../processed/df_channels_en.tsv.gz\").resolve()\n",
261+
"comments_path = (RAW_DIR / \"youtube_comments.tsv.gz\").resolve()\n",
262+
"meta_path = (RAW_DIR / \"yt_metadata_helper.feather\").resolve()\n",
263+
"channels_path = (RAW_DIR / \"df_channels_en.tsv.gz\").resolve()\n",
262264
"\n",
263265
"# Optional: quick sanity checks / peek at helper metadata (lightweight)\n",
264266
"if meta_path.exists():\n",
@@ -2310,8 +2312,8 @@
23102312
"# ### 2.2.1 Building the groups\n",
23112313
"\n",
23122314
"author_channel_path = author_channel_out\n",
2313-
"author_groups_path = \"../Dataset/custom_3/author_groups.tsv.gz\"\n",
2314-
"groups_channelid_numc_path = \"../Dataset/custom_3/groups_channelid_numc.tsv.gz\"\n",
2315+
"author_groups_path = (MODELS_DIR / \"author_groups.tsv.gz\").resolve()\n",
2316+
"groups_channelid_numc_path = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()\n",
23152317
"\n",
23162318
"build_author_groups_and_group_channels(\n",
23172319
" path=author_channel_path,\n",
@@ -2349,12 +2351,13 @@
23492351
"metadata": {},
23502352
"outputs": [],
23512353
"source": [
2354+
"out_path = (MODELS_DIR / \"groups_num_authors.tsv.gz\").resolve()\n",
23522355
"compute_groups_num_authors(\n",
2353-
" author_groups_path=\"../Dataset/custom_3/author_groups.tsv.gz\",\n",
2354-
" out_path=\"../Dataset/custom_3/groups_num_authors.tsv.gz\",\n",
2356+
" author_groups_path=author_groups_path,\n",
2357+
" out_path=out_path,\n",
23552358
")\n",
23562359
"\n",
2357-
"groups_nauthor = pd.read_csv(\"../Dataset/custom_3/groups_num_authors.tsv.gz\", sep=\"\\t\", compression=\"infer\")\n",
2360+
"groups_nauthor = pd.read_csv(out_path, sep=\"\\t\", compression=\"infer\")\n",
23582361
"groups_nauthor.head()\n"
23592362
]
23602363
},
@@ -2377,15 +2380,15 @@
23772380
"import csv\n",
23782381
"from tqdm.notebook import tqdm\n",
23792382
"\n",
2380-
"input_path = \"../Dataset/custom_3/groups_channelid_numc.tsv.gz\"\n",
2381-
"output_path = \"../Dataset/custom_3/groups_num_channels.tsv.gz\"\n",
2383+
"input_path = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()\n",
2384+
"output_path = (MODELS_DIR / \"groups_num_channels.tsv.gz\").resolve()\n",
23822385
"\n",
23832386
"total_lines = 3_509_000_000\n",
23842387
"\n",
23852388
"\n",
23862389
"compute_groups_num_channels(\n",
2387-
" groups_channelid_numc_path=\"../Dataset/custom_3/groups_channelid_numc.tsv.gz\",\n",
2388-
" out_path=\"../Dataset/custom_3/groups_num_channels.tsv.gz\",\n",
2390+
" groups_channelid_numc_path=input_path,\n",
2391+
" out_path=output_path,\n",
23892392
" total_lines=3_509_000_000, # or None if you don't care about tqdm\n",
23902393
")\n",
23912394
"\n"
@@ -2418,12 +2421,12 @@
24182421
"import csv\n",
24192422
"from tqdm import tqdm\n",
24202423
"\n",
2421-
"input_path = \"../Dataset/custom_3/groups_channelid_numc.tsv.gz\"\n",
2422-
"output_path = \"../Dataset/custom_3/groups_total_comments.tsv.gz\"\n",
2424+
"input_path = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()\n",
2425+
"output_path = (MODELS_DIR / \"groups_total_comments.tsv.gz\").resolve()\n",
24232426
"\n",
24242427
"compute_groups_total_comments(\n",
2425-
" groups_channelid_numc_path=\"../Dataset/custom_3/groups_channelid_numc.tsv.gz\",\n",
2426-
" out_path=\"../Dataset/custom_3/groups_total_comments.tsv.gz\",\n",
2428+
" groups_channelid_numc_path=input_path,\n",
2429+
" out_path=output_path,\n",
24272430
" total_lines=3_509_000_000,\n",
24282431
")\n"
24292432
]
@@ -2462,7 +2465,7 @@
24622465
},
24632466
{
24642467
"cell_type": "code",
2465-
"execution_count": 2,
2468+
"execution_count": null,
24662469
"id": "d6745f51af2dc3bc",
24672470
"metadata": {
24682471
"ExecuteTime": {
@@ -2486,7 +2489,9 @@
24862489
"import pandas as pd\n",
24872490
"import matplotlib.pyplot as plt\n",
24882491
"\n",
2489-
"groups_nauthor = pd.read_csv(\"../Dataset/custom_3/groups_num_authors.tsv.gz\", sep=\"\\t\", compression=\"infer\")\n",
2492+
"groups_nauthor_path = (MODELS_DIR / \"groups_num_authors.tsv.gz\").resolve()\n",
2493+
"\n",
2494+
"groups_nauthor = pd.read_csv(groups_nauthor_path, sep=\"\\t\", compression=\"infer\")\n",
24902495
"\n",
24912496
"plt.figure(figsize=(8, 5))\n",
24922497
"plt.hist(groups_nauthor[\"num_authors\"], log=True, bins=50)\n",
@@ -2498,7 +2503,7 @@
24982503
},
24992504
{
25002505
"cell_type": "code",
2501-
"execution_count": 3,
2506+
"execution_count": null,
25022507
"id": "f7081af991c846c8",
25032508
"metadata": {
25042509
"ExecuteTime": {
@@ -2522,8 +2527,10 @@
25222527
"import pandas as pd\n",
25232528
"import matplotlib.pyplot as plt\n",
25242529
"\n",
2530+
"groups_total_comments_path = (MODELS_DIR / \"groups_total_comments.tsv.gz\").resolve()\n",
2531+
"\n",
25252532
"total_c = pd.read_csv(\n",
2526-
" \"../Dataset/custom_3/groups_total_comments.tsv.gz\",\n",
2533+
" groups_total_comments_path,\n",
25272534
" sep=\"\\t\",\n",
25282535
" compression=\"infer\"\n",
25292536
")\n",
@@ -2538,7 +2545,7 @@
25382545
},
25392546
{
25402547
"cell_type": "code",
2541-
"execution_count": 4,
2548+
"execution_count": null,
25422549
"id": "f0b90b12bc375726",
25432550
"metadata": {
25442551
"ExecuteTime": {
@@ -2562,8 +2569,9 @@
25622569
"import pandas as pd\n",
25632570
"import matplotlib.pyplot as plt\n",
25642571
"\n",
2572+
"groups_num_channels_path = (MODELS_DIR / \"groups_num_channels.tsv.gz\").resolve()\n",
25652573
"n_channels = pd.read_csv(\n",
2566-
" \"../Dataset/custom_3/groups_num_channels.tsv.gz\",\n",
2574+
" groups_num_channels_path,\n",
25672575
" sep=\"\\t\",\n",
25682576
" compression=\"infer\"\n",
25692577
")\n",
@@ -2601,10 +2609,14 @@
26012609
"source": [
26022610
"OUT_DIR = \"web/public/data\"\n",
26032611
"\n",
2612+
"gna_path = (MODELS_DIR / \"groups_num_authors.tsv.gz\").resolve()\n",
2613+
"gnc_path = (MODELS_DIR / \"groups_num_channels.tsv.gz\").resolve()\n",
2614+
"gtc_path = (MODELS_DIR / \"groups_total_comments.tsv.gz\").resolve()\n",
2615+
"\n",
26042616
"export_group_histograms(\n",
2605-
" groups_num_authors_path=\"../Dataset/custom_3/groups_num_authors.tsv.gz\",\n",
2606-
" groups_total_comments_path=\"../Dataset/custom_3/groups_total_comments.tsv.gz\",\n",
2607-
" groups_num_channels_path=\"../Dataset/custom_3/groups_num_channels.tsv.gz\",\n",
2617+
" groups_num_authors_path=gna_path,\n",
2618+
" groups_total_comments_path=gtc_path,\n",
2619+
" groups_num_channels_path=gnc_path,\n",
26082620
" out_dir=OUT_DIR,\n",
26092621
")\n"
26102622
]
@@ -2667,11 +2679,15 @@
26672679
"metadata": {},
26682680
"outputs": [],
26692681
"source": [
2682+
"groups_path = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()\n",
2683+
"channels_path = (RAW_DIR / \"df_channels_en.tsv.gz\").resolve()\n",
2684+
"out_path = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n",
2685+
"\n",
26702686
"build_group_features(\n",
2671-
" groups_path=\"../Dataset/custom_3/groups_channelid_numc.tsv.gz\",\n",
2672-
" channels_path=\"../Dataset/original/df_channels.tsv\",\n",
2673-
" out_path=\"../Dataset/custom_3/groups_features.tsv.gz\",\n",
2674-
" total_rows=3_239_637_783, # whatever you used as total_rows\n",
2687+
" groups_path=groups_path,\n",
2688+
" channels_path=channels_path,\n",
2689+
" out_path=out_path,\n",
2690+
" total_rows=3_239_637_783, # whatever we used as total_rows\n",
26752691
")\n"
26762692
]
26772693
},
@@ -2685,7 +2701,7 @@
26852701
},
26862702
{
26872703
"cell_type": "code",
2688-
"execution_count": 6,
2704+
"execution_count": null,
26892705
"id": "f4d29261dbf88949",
26902706
"metadata": {
26912707
"ExecuteTime": {
@@ -2729,7 +2745,9 @@
27292745
"import pandas as pd\n",
27302746
"import matplotlib.pyplot as plt\n",
27312747
"\n",
2732-
"feat = pd.read_csv(\"../Dataset/custom_3/groups_features.tsv.gz\", sep=\"\\t\", compression=\"infer\")\n",
2748+
"fp = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n",
2749+
"\n",
2750+
"feat = pd.read_csv(fp, sep=\"\\t\", compression=\"infer\")\n",
27332751
"\n",
27342752
"feat = feat[(feat[\"total_comments\"] > 10) & (feat[\"num_channels\"] < 1000)]\n",
27352753
"\n",
@@ -2777,8 +2795,10 @@
27772795
"source": [
27782796
"OUT_DIR = \"web/public/data\"\n",
27792797
"\n",
2798+
"fp = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n",
2799+
"\n",
27802800
"export_feature_distributions(\n",
2781-
" features_path=\"../Dataset/custom_3/groups_features.tsv.gz\",\n",
2801+
" features_path=fp,\n",
27822802
" out_dir=OUT_DIR,\n",
27832803
" min_tc=10,\n",
27842804
" max_tc=1000,\n",
@@ -2800,7 +2820,9 @@
28002820
"OUT_DIR = \"web/public/data\"\n",
28012821
"os.makedirs(OUT_DIR, exist_ok=True)\n",
28022822
"\n",
2803-
"feat = pd.read_csv(\"../Dataset/custom_3/groups_features.tsv.gz\", sep=\"\\t\", compression=\"infer\")\n",
2823+
"fp = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n",
2824+
"\n",
2825+
"feat = pd.read_csv(fp, sep=\"\\t\", compression=\"infer\")\n",
28042826
"for c in [\"total_comments\",\"num_channels\",\"fidelity\",\"category_entropy\"]:\n",
28052827
" feat[c] = pd.to_numeric(feat[c], errors=\"coerce\")\n",
28062828
"\n",
@@ -2866,8 +2888,8 @@
28662888
"metadata": {},
28672889
"outputs": [],
28682890
"source": [
2869-
"features_path = \"../Dataset/custom_3/groups_features.tsv.gz\"\n",
2870-
"clusters_path = \"../Dataset/custom_3/groups_kmeans_10.tsv.gz\"\n",
2891+
"features_path = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n",
2892+
"clusters_path = (MODELS_DIR / \"groups_kmeans_10.tsv.gz\").resolve()\n",
28712893
"\n",
28722894
"run_kmeans_on_groups(\n",
28732895
" features_path=features_path,\n",
@@ -2885,10 +2907,14 @@
28852907
"metadata": {},
28862908
"outputs": [],
28872909
"source": [
2910+
"authors_path = (MODELS_DIR / \"groups_num_authors.tsv.gz\").resolve()\n",
2911+
"clusters_path = (MODELS_DIR / \"groups_kmeans_10.tsv.gz\").resolve()\n",
2912+
"out_path = (MODELS_DIR / \"kmeans_cluster_people_share.tsv\").resolve()\n",
2913+
"\n",
28882914
"summarize_kmeans_cluster_people_share(\n",
28892915
" clusters_path=clusters_path,\n",
2890-
" authors_path=\"../Dataset/custom_3/groups_num_authors.tsv.gz\",\n",
2891-
" out_path=\"../Dataset/custom_3/kmeans_cluster_people_share.tsv\",\n",
2916+
" authors_path=authors_path,\n",
2917+
" out_path=out_path,\n",
28922918
")\n"
28932919
]
28942920
},
@@ -2907,9 +2933,11 @@
29072933
"metadata": {},
29082934
"outputs": [],
29092935
"source": [
2936+
"features_path = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n",
2937+
"clusters_path = (MODELS_DIR / \"groups_kmeans_15_1000.tsv.gz\").resolve()\n",
29102938
"plot_kmeans_clusters(\n",
2911-
" features_path=\"../Dataset/custom_3/groups_features.tsv.gz\",\n",
2912-
" clusters_path=\"../Dataset/custom_3/groups_kmeans_15_1000.tsv.gz\",\n",
2939+
" features_path=features_path,\n",
2940+
" clusters_path=clusters_path,\n",
29132941
" min_tc=10,\n",
29142942
" max_tc=1000,\n",
29152943
" max_points=300_000,\n",
@@ -2936,8 +2964,9 @@
29362964
},
29372965
"outputs": [],
29382966
"source": [
2967+
"\n",
29392968
"export_kmeans_explorer_assets(\n",
2940-
" features_path=\"../Dataset/custom_3/groups_features.tsv.gz\",\n",
2969+
" features_path=features_path,\n",
29412970
" out_dir=\"web/public/data/kmeans_explorer\",\n",
29422971
" min_tc=10,\n",
29432972
" max_tc=1000,\n",
@@ -3007,10 +3036,13 @@
30073036
"metadata": {},
30083037
"outputs": [],
30093038
"source": [
3039+
"gp = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()\n",
3040+
"ch = (RAW_DIR / \"df_channels_en.tsv.gz\").resolve()\n",
3041+
"out = (MODELS_DIR / \"groups_category_numc.tsv.gz\").resolve()\n",
30103042
"category_analysis_for_groups(\n",
3011-
" groups_path=\"../Dataset/custom_3/groups_channelid_numc.tsv.gz\",\n",
3012-
" channels_path=\"../Dataset/original/df_channels.tsv\",\n",
3013-
" out_path=\"../Dataset/custom_3/groups_category_numc.tsv.gz\",\n",
3043+
" groups_path=gp,\n",
3044+
" channels_path=ch,\n",
3045+
" out_path=out,\n",
30143046
" total_lines=3_509_000_000,\n",
30153047
")\n"
30163048
]

0 commit comments

Comments
 (0)