|
107 | 107 | "BASE_DIR = Path(\".\").resolve()\n", |
108 | 108 | "ANALYSIS_DIR = BASE_DIR / \"analysis_out\"\n", |
109 | 109 | "DATA_DIR = BASE_DIR / \"data\"\n", |
| 110 | + "RAW_DIR = DATA_DIR / \"raw\"\n", |
| 111 | + "MODELS_DIR = DATA_DIR / \"models\"\n", |
110 | 112 | "\n", |
111 | 113 | "# Derived outputs (same directory you used before)\n", |
112 | 114 | "DERIVED_DIR = BASE_DIR / \"derived\"\n", |
|
256 | 258 | ], |
257 | 259 | "source": [ |
258 | 260 | "# Raw inputs \n", |
259 | | - "comments_path = (BASE_DIR / \"../../big/youtube_comments.tsv.gz\").resolve()\n", |
260 | | - "meta_path = (BASE_DIR / \"../../helper/yt_metadata_helper.feather\").resolve()\n", |
261 | | - "channels_path = (BASE_DIR / \"../../processed/df_channels_en.tsv.gz\").resolve()\n", |
| 261 | + "comments_path = (RAW_DIR / \"youtube_comments.tsv.gz\").resolve()\n", |
| 262 | + "meta_path = (RAW_DIR / \"yt_metadata_helper.feather\").resolve()\n", |
| 263 | + "channels_path = (RAW_DIR / \"df_channels_en.tsv.gz\").resolve()\n", |
262 | 264 | "\n", |
263 | 265 | "# Optional: quick sanity checks / peek at helper metadata (lightweight)\n", |
264 | 266 | "if meta_path.exists():\n", |
|
2310 | 2312 | "# ### 2.2.1 Building the groups\n", |
2311 | 2313 | "\n", |
2312 | 2314 | "author_channel_path = author_channel_out\n", |
2313 | | - "author_groups_path = \"../Dataset/custom_3/author_groups.tsv.gz\"\n", |
2314 | | - "groups_channelid_numc_path = \"../Dataset/custom_3/groups_channelid_numc.tsv.gz\"\n", |
| 2315 | + "author_groups_path = (MODELS_DIR / \"author_groups.tsv.gz\").resolve()\n", |
| 2316 | + "groups_channelid_numc_path = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()\n", |
2315 | 2317 | "\n", |
2316 | 2318 | "build_author_groups_and_group_channels(\n", |
2317 | 2319 | " path=author_channel_path,\n", |
|
2349 | 2351 | "metadata": {}, |
2350 | 2352 | "outputs": [], |
2351 | 2353 | "source": [ |
| 2354 | + "out_path = (MODELS_DIR / \"groups_num_authors.tsv.gz\").resolve()\n", |
2352 | 2355 | "compute_groups_num_authors(\n", |
2353 | | - " author_groups_path=\"../Dataset/custom_3/author_groups.tsv.gz\",\n", |
2354 | | - " out_path=\"../Dataset/custom_3/groups_num_authors.tsv.gz\",\n", |
| 2356 | + " author_groups_path=author_groups_path,\n", |
| 2357 | + " out_path=out_path,\n", |
2355 | 2358 | ")\n", |
2356 | 2359 | "\n", |
2357 | | - "groups_nauthor = pd.read_csv(\"../Dataset/custom_3/groups_num_authors.tsv.gz\", sep=\"\\t\", compression=\"infer\")\n", |
| 2360 | + "groups_nauthor = pd.read_csv(out_path, sep=\"\\t\", compression=\"infer\")\n", |
2358 | 2361 | "groups_nauthor.head()\n" |
2359 | 2362 | ] |
2360 | 2363 | }, |
|
2377 | 2380 | "import csv\n", |
2378 | 2381 | "from tqdm.notebook import tqdm\n", |
2379 | 2382 | "\n", |
2380 | | - "input_path = \"../Dataset/custom_3/groups_channelid_numc.tsv.gz\"\n", |
2381 | | - "output_path = \"../Dataset/custom_3/groups_num_channels.tsv.gz\"\n", |
| 2383 | + "input_path = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()\n", |
| 2384 | + "output_path = (MODELS_DIR / \"groups_num_channels.tsv.gz\").resolve()\n", |
2382 | 2385 | "\n", |
2383 | 2386 | "total_lines = 3_509_000_000\n", |
2384 | 2387 | "\n", |
2385 | 2388 | "\n", |
2386 | 2389 | "compute_groups_num_channels(\n", |
2387 | | - " groups_channelid_numc_path=\"../Dataset/custom_3/groups_channelid_numc.tsv.gz\",\n", |
2388 | | - " out_path=\"../Dataset/custom_3/groups_num_channels.tsv.gz\",\n", |
| 2390 | + " groups_channelid_numc_path=input_path,\n", |
| 2391 | + " out_path=output_path,\n", |
2389 | 2392 | " total_lines=3_509_000_000, # or None if you don't care about tqdm\n", |
2390 | 2393 | ")\n", |
2391 | 2394 | "\n" |
|
2418 | 2421 | "import csv\n", |
2419 | 2422 | "from tqdm import tqdm\n", |
2420 | 2423 | "\n", |
2421 | | - "input_path = \"../Dataset/custom_3/groups_channelid_numc.tsv.gz\"\n", |
2422 | | - "output_path = \"../Dataset/custom_3/groups_total_comments.tsv.gz\"\n", |
| 2424 | + "input_path = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()\n", |
| 2425 | + "output_path = (MODELS_DIR / \"groups_total_comments.tsv.gz\").resolve()\n", |
2423 | 2426 | "\n", |
2424 | 2427 | "compute_groups_total_comments(\n", |
2425 | | - " groups_channelid_numc_path=\"../Dataset/custom_3/groups_channelid_numc.tsv.gz\",\n", |
2426 | | - " out_path=\"../Dataset/custom_3/groups_total_comments.tsv.gz\",\n", |
| 2428 | + " groups_channelid_numc_path=input_path,\n", |
| 2429 | + " out_path=output_path,\n", |
2427 | 2430 | " total_lines=3_509_000_000,\n", |
2428 | 2431 | ")\n" |
2429 | 2432 | ] |
|
2462 | 2465 | }, |
2463 | 2466 | { |
2464 | 2467 | "cell_type": "code", |
2465 | | - "execution_count": 2, |
| 2468 | + "execution_count": null, |
2466 | 2469 | "id": "d6745f51af2dc3bc", |
2467 | 2470 | "metadata": { |
2468 | 2471 | "ExecuteTime": { |
|
2486 | 2489 | "import pandas as pd\n", |
2487 | 2490 | "import matplotlib.pyplot as plt\n", |
2488 | 2491 | "\n", |
2489 | | - "groups_nauthor = pd.read_csv(\"../Dataset/custom_3/groups_num_authors.tsv.gz\", sep=\"\\t\", compression=\"infer\")\n", |
| 2492 | + "groups_nauthor_path = (MODELS_DIR / \"groups_num_authors.tsv.gz\").resolve()\n", |
| 2493 | + "\n", |
| 2494 | + "groups_nauthor = pd.read_csv(groups_nauthor_path, sep=\"\\t\", compression=\"infer\")\n", |
2490 | 2495 | "\n", |
2491 | 2496 | "plt.figure(figsize=(8, 5))\n", |
2492 | 2497 | "plt.hist(groups_nauthor[\"num_authors\"], log=True, bins=50)\n", |
|
2498 | 2503 | }, |
2499 | 2504 | { |
2500 | 2505 | "cell_type": "code", |
2501 | | - "execution_count": 3, |
| 2506 | + "execution_count": null, |
2502 | 2507 | "id": "f7081af991c846c8", |
2503 | 2508 | "metadata": { |
2504 | 2509 | "ExecuteTime": { |
|
2522 | 2527 | "import pandas as pd\n", |
2523 | 2528 | "import matplotlib.pyplot as plt\n", |
2524 | 2529 | "\n", |
| 2530 | + "groups_total_comments_path = (MODELS_DIR / \"groups_total_comments.tsv.gz\").resolve()\n", |
| 2531 | + "\n", |
2525 | 2532 | "total_c = pd.read_csv(\n", |
2526 | | - " \"../Dataset/custom_3/groups_total_comments.tsv.gz\",\n", |
| 2533 | + " groups_total_comments_path,\n", |
2527 | 2534 | " sep=\"\\t\",\n", |
2528 | 2535 | " compression=\"infer\"\n", |
2529 | 2536 | ")\n", |
|
2538 | 2545 | }, |
2539 | 2546 | { |
2540 | 2547 | "cell_type": "code", |
2541 | | - "execution_count": 4, |
| 2548 | + "execution_count": null, |
2542 | 2549 | "id": "f0b90b12bc375726", |
2543 | 2550 | "metadata": { |
2544 | 2551 | "ExecuteTime": { |
|
2562 | 2569 | "import pandas as pd\n", |
2563 | 2570 | "import matplotlib.pyplot as plt\n", |
2564 | 2571 | "\n", |
| 2572 | + "groups_num_channels_path = (MODELS_DIR / \"groups_num_channels.tsv.gz\").resolve()\n", |
2565 | 2573 | "n_channels = pd.read_csv(\n", |
2566 | | - " \"../Dataset/custom_3/groups_num_channels.tsv.gz\",\n", |
| 2574 | + " groups_num_channels_path,\n", |
2567 | 2575 | " sep=\"\\t\",\n", |
2568 | 2576 | " compression=\"infer\"\n", |
2569 | 2577 | ")\n", |
|
2601 | 2609 | "source": [ |
2602 | 2610 | "OUT_DIR = \"web/public/data\"\n", |
2603 | 2611 | "\n", |
| 2612 | + "gna_path = (MODELS_DIR / \"groups_num_authors.tsv.gz\").resolve()\n", |
| 2613 | + "gnc_path = (MODELS_DIR / \"groups_num_channels.tsv.gz\").resolve()\n", |
| 2614 | + "gtc_path = (MODELS_DIR / \"groups_total_comments.tsv.gz\").resolve()\n", |
| 2615 | + "\n", |
2604 | 2616 | "export_group_histograms(\n", |
2605 | | - " groups_num_authors_path=\"../Dataset/custom_3/groups_num_authors.tsv.gz\",\n", |
2606 | | - " groups_total_comments_path=\"../Dataset/custom_3/groups_total_comments.tsv.gz\",\n", |
2607 | | - " groups_num_channels_path=\"../Dataset/custom_3/groups_num_channels.tsv.gz\",\n", |
| 2617 | + " groups_num_authors_path=gna_path,\n", |
| 2618 | + " groups_total_comments_path=gtc_path,\n", |
| 2619 | + " groups_num_channels_path=gnc_path,\n", |
2608 | 2620 | " out_dir=OUT_DIR,\n", |
2609 | 2621 | ")\n" |
2610 | 2622 | ] |
|
2667 | 2679 | "metadata": {}, |
2668 | 2680 | "outputs": [], |
2669 | 2681 | "source": [ |
| 2682 | + "groups_path = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()\n", |
| 2683 | + "channels_path = (RAW_DIR / \"df_channels_en.tsv.gz\").resolve()\n", |
| 2684 | + "out_path = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n", |
| 2685 | + "\n", |
2670 | 2686 | "build_group_features(\n", |
2671 | | - " groups_path=\"../Dataset/custom_3/groups_channelid_numc.tsv.gz\",\n", |
2672 | | - " channels_path=\"../Dataset/original/df_channels.tsv\",\n", |
2673 | | - " out_path=\"../Dataset/custom_3/groups_features.tsv.gz\",\n", |
2674 | | - " total_rows=3_239_637_783, # whatever you used as total_rows\n", |
| 2687 | + " groups_path=groups_path,\n", |
| 2688 | + " channels_path=channels_path,\n", |
| 2689 | + " out_path=out_path,\n", |
| 2690 | + " total_rows=3_239_637_783, # whatever we used as total_rows\n", |
2675 | 2691 | ")\n" |
2676 | 2692 | ] |
2677 | 2693 | }, |
|
2685 | 2701 | }, |
2686 | 2702 | { |
2687 | 2703 | "cell_type": "code", |
2688 | | - "execution_count": 6, |
| 2704 | + "execution_count": null, |
2689 | 2705 | "id": "f4d29261dbf88949", |
2690 | 2706 | "metadata": { |
2691 | 2707 | "ExecuteTime": { |
|
2729 | 2745 | "import pandas as pd\n", |
2730 | 2746 | "import matplotlib.pyplot as plt\n", |
2731 | 2747 | "\n", |
2732 | | - "feat = pd.read_csv(\"../Dataset/custom_3/groups_features.tsv.gz\", sep=\"\\t\", compression=\"infer\")\n", |
| 2748 | + "fp = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n", |
| 2749 | + "\n", |
| 2750 | + "feat = pd.read_csv(fp, sep=\"\\t\", compression=\"infer\")\n", |
2733 | 2751 | "\n", |
2734 | 2752 | "feat = feat[(feat[\"total_comments\"] > 10) & (feat[\"num_channels\"] < 1000)]\n", |
2735 | 2753 | "\n", |
|
2777 | 2795 | "source": [ |
2778 | 2796 | "OUT_DIR = \"web/public/data\"\n", |
2779 | 2797 | "\n", |
| 2798 | + "fp = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n", |
| 2799 | + "\n", |
2780 | 2800 | "export_feature_distributions(\n", |
2781 | | - " features_path=\"../Dataset/custom_3/groups_features.tsv.gz\",\n", |
| 2801 | + " features_path=fp,\n", |
2782 | 2802 | " out_dir=OUT_DIR,\n", |
2783 | 2803 | " min_tc=10,\n", |
2784 | 2804 | " max_tc=1000,\n", |
|
2800 | 2820 | "OUT_DIR = \"web/public/data\"\n", |
2801 | 2821 | "os.makedirs(OUT_DIR, exist_ok=True)\n", |
2802 | 2822 | "\n", |
2803 | | - "feat = pd.read_csv(\"../Dataset/custom_3/groups_features.tsv.gz\", sep=\"\\t\", compression=\"infer\")\n", |
| 2823 | + "fp = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n", |
| 2824 | + "\n", |
| 2825 | + "feat = pd.read_csv(fp, sep=\"\\t\", compression=\"infer\")\n", |
2804 | 2826 | "for c in [\"total_comments\",\"num_channels\",\"fidelity\",\"category_entropy\"]:\n", |
2805 | 2827 | " feat[c] = pd.to_numeric(feat[c], errors=\"coerce\")\n", |
2806 | 2828 | "\n", |
|
2866 | 2888 | "metadata": {}, |
2867 | 2889 | "outputs": [], |
2868 | 2890 | "source": [ |
2869 | | - "features_path = \"../Dataset/custom_3/groups_features.tsv.gz\"\n", |
2870 | | - "clusters_path = \"../Dataset/custom_3/groups_kmeans_10.tsv.gz\"\n", |
| 2891 | + "features_path = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n", |
| 2892 | + "clusters_path = (MODELS_DIR / \"groups_kmeans_10.tsv.gz\").resolve()\n", |
2871 | 2893 | "\n", |
2872 | 2894 | "run_kmeans_on_groups(\n", |
2873 | 2895 | " features_path=features_path,\n", |
|
2885 | 2907 | "metadata": {}, |
2886 | 2908 | "outputs": [], |
2887 | 2909 | "source": [ |
| 2910 | + "authors_path = (MODELS_DIR / \"groups_num_authors.tsv.gz\").resolve()\n", |
| 2911 | + "clusters_path = (MODELS_DIR / \"groups_kmeans_10.tsv.gz\").resolve()\n", |
| 2912 | + "out_path = (MODELS_DIR / \"kmeans_cluster_people_share.tsv\").resolve()\n", |
| 2913 | + "\n", |
2888 | 2914 | "summarize_kmeans_cluster_people_share(\n", |
2889 | 2915 | " clusters_path=clusters_path,\n", |
2890 | | - " authors_path=\"../Dataset/custom_3/groups_num_authors.tsv.gz\",\n", |
2891 | | - " out_path=\"../Dataset/custom_3/kmeans_cluster_people_share.tsv\",\n", |
| 2916 | + " authors_path=authors_path,\n", |
| 2917 | + " out_path=out_path,\n", |
2892 | 2918 | ")\n" |
2893 | 2919 | ] |
2894 | 2920 | }, |
|
2907 | 2933 | "metadata": {}, |
2908 | 2934 | "outputs": [], |
2909 | 2935 | "source": [ |
| 2936 | + "features_path = (MODELS_DIR / \"groups_features.tsv.gz\").resolve()\n", |
| 2937 | + "clusters_path = (MODELS_DIR / \"groups_kmeans_15_1000.tsv.gz\").resolve()\n", |
2910 | 2938 | "plot_kmeans_clusters(\n", |
2911 | | - " features_path=\"../Dataset/custom_3/groups_features.tsv.gz\",\n", |
2912 | | - " clusters_path=\"../Dataset/custom_3/groups_kmeans_15_1000.tsv.gz\",\n", |
| 2939 | + " features_path=features_path,\n", |
| 2940 | + " clusters_path=clusters_path,\n", |
2913 | 2941 | " min_tc=10,\n", |
2914 | 2942 | " max_tc=1000,\n", |
2915 | 2943 | " max_points=300_000,\n", |
|
2936 | 2964 | }, |
2937 | 2965 | "outputs": [], |
2938 | 2966 | "source": [ |
| 2967 | + "\n", |
2939 | 2968 | "export_kmeans_explorer_assets(\n", |
2940 | | - " features_path=\"../Dataset/custom_3/groups_features.tsv.gz\",\n", |
| 2969 | + " features_path=features_path,\n", |
2941 | 2970 | " out_dir=\"web/public/data/kmeans_explorer\",\n", |
2942 | 2971 | " min_tc=10,\n", |
2943 | 2972 | " max_tc=1000,\n", |
|
3007 | 3036 | "metadata": {}, |
3008 | 3037 | "outputs": [], |
3009 | 3038 | "source": [ |
| 3039 | + "gp = (MODELS_DIR / \"groups_channelid_numc.tsv.gz\").resolve()\n", |
| 3040 | + "ch = (RAW_DIR / \"df_channels_en.tsv.gz\").resolve()\n", |
| 3041 | + "out = (MODELS_DIR / \"groups_category_numc.tsv.gz\").resolve()\n", |
3010 | 3042 | "category_analysis_for_groups(\n", |
3011 | | - " groups_path=\"../Dataset/custom_3/groups_channelid_numc.tsv.gz\",\n", |
3012 | | - " channels_path=\"../Dataset/original/df_channels.tsv\",\n", |
3013 | | - " out_path=\"../Dataset/custom_3/groups_category_numc.tsv.gz\",\n", |
| 3043 | + " groups_path=gp,\n", |
| 3044 | + " channels_path=ch,\n", |
| 3045 | + " out_path=out,\n", |
3014 | 3046 | " total_lines=3_509_000_000,\n", |
3015 | 3047 | ")\n" |
3016 | 3048 | ] |
|
0 commit comments