# Load the PSCDD positive-label development dataset for El Pitazo.
# The path is relative to this notebook's folder (notebooks/webscraper/).
df_positivelabels_original = pd.read_csv(
    filepath_or_buffer="../../data/processed/webscraping/elpitazo_positivelabels_devdataset.csv"
)
## Time
# The date summary quoted in the markdown cell can be reproduced with:
# pd.to_datetime(df_positivelabels_original.fecha, infer_datetime_format=True).describe()

## Location
# Split every article URL on "/" and count the values of the piece at index 3
# (presumably the section slug, e.g. "occidente" -- confirm against the data).
vcounts_location = (
    df_positivelabels_original["link_de_la_noticia"]
    .str.split("/", expand=True)[3]
    .value_counts()
    .rename("count")
)
# To regenerate the markdown table shown above:
# print(vcounts_location.to_markdown())
def elpitazo_page_discovery(url: str) -> list:
    """Return the article links listed on one El Pitazo listing page.

    Downloads ``url`` with a browser-like ``User-Agent`` header, parses the
    HTML and collects the ``href`` of the first anchor found inside every
    ``<h3 class="entry-title td-module-title">`` headline.

    Parameters
    ----------
    url : str
        Address of the listing page to fetch.

    Returns
    -------
    list of str
        Headline links in document order. Empty when the page contains no
        matching headline. Headlines without an anchor, and anchors without
        an ``href``, are skipped.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, exceeds the 20 second timeout, or the server
        answers with a 4xx/5xx status code.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
    }

    page = requests.get(url, headers=headers, timeout=20)
    # Fail loudly on HTTP errors: otherwise an error page would be parsed as
    # if it were a real listing and its links (or lack of them) returned.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")

    ## TODO: Include date to find the articles that match the date of the positive labels PSCDD
    # _date = soup.find_all("div", {"class": "td-editor-date"})

    headlines = soup.find_all("h3", {"class": "entry-title td-module-title"})

    ls_links = []
    for headline in headlines:
        # find() returns None when the headline has no <a>, so a malformed
        # entry is skipped instead of raising IndexError.
        anchor = headline.find("a")
        href = anchor.get("href") if anchor is not None else None
        if href:
            ls_links.append(href)

    return ls_links