From 3206d902bca167c1354d1d438818149ab9d2ec5e Mon Sep 17 00:00:00 2001 From: Tri Nguyen Date: Thu, 28 Mar 2024 19:56:40 +0700 Subject: [PATCH] migration: migrate DB running on vps to Render --- .../crawling-checkpoint.ipynb | 358 ------------------ 1 file changed, 358 deletions(-) delete mode 100644 scripts/crawling/.ipynb_checkpoints/crawling-checkpoint.ipynb diff --git a/scripts/crawling/.ipynb_checkpoints/crawling-checkpoint.ipynb b/scripts/crawling/.ipynb_checkpoints/crawling-checkpoint.ipynb deleted file mode 100644 index 05d508c..0000000 --- a/scripts/crawling/.ipynb_checkpoints/crawling-checkpoint.ipynb +++ /dev/null @@ -1,358 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 81, - "id": "459ac3b8-31b0-4219-a849-a227848e97e1", - "metadata": {}, - "outputs": [], - "source": [ - "from selenium import webdriver\n", - "import pickle\n", - "from selenium.webdriver.common.by import By\n", - "import pandas as pd\n", - "import time" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "5abf463e-f64d-440f-8863-592440650ffc", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "def save_cookie(driver):\n", - " with open(\"cookie\", 'wb') as filehandler:\n", - " pickle.dump(driver.get_cookies(), filehandler)\n", - "def load_cookie(driver):\n", - " with open(\"cookie\", 'rb') as cookiesfile:\n", - " cookies = pickle.load(cookiesfile)\n", - " for cookie in cookies:\n", - " print(cookie)\n", - " driver.add_cookie(cookie)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "id": "4f8b8e4e-4b18-48fd-b0b8-c9815e5a7631", - "metadata": {}, - "outputs": [], - "source": [ - "driver = webdriver.Chrome()" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "id": "1b0885e3-b7d2-4e39-ae1d-337ec940f358", - "metadata": {}, - "outputs": [], - "source": [ - "driver.get(\"https://study4.com/tests/4590/ets-23-toeic-test-1/practice/?part=9773&part=9774&part=9775&part=9776&part=9777&part=9778&part=9779\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "id": "d5f85095-684f-41b1-85c9-2f2ea3389b61", - "metadata": {}, - "outputs": [], - "source": [ - "driver.add_cookie({\"name\": \"sessionid\", \"value\": \"enuahhni1nzhwklu1geawox96uaqoece\"})\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20cfcc53-84c2-400c-9b26-329d90119d36", - "metadata": {}, - "outputs": [], - "source": [ - "driver.get(\"https://study4.com/tests/4590/ets-23-toeic-test-1/practice/?part=9773&part=9774&part=9775&part=9776&part=9777&part=9778&part=9779\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "id": "b3335a8c-2ae2-42ca-a8bb-cdb3de2000b6", - "metadata": {}, - "outputs": [], - "source": [ - "test1part5 = driver.find_element(by=By.XPATH, value=\"//*[@id='partcontent-9777']/div[2]\")\n", - "test1part7 = driver.find_element(by=By.XPATH, value=\"//*[@id='partcontent-9779']/div[2]\")\n", - "test1part6 = driver.find_element(by=By.XPATH, value=\"//*[@id='partcontent-9778']/div[2]\")" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "id": "91837355-ca2a-4859-a3f2-0a91bc823188", - "metadata": {}, - "outputs": [], - "source": [ - "children5 = test1part5.find_elements(By.XPATH, '*')\n", - "children7 = test1part7.find_elements(By.XPATH, '*')\n", - "children6 = test1part6.find_elements(By.XPATH, '*')" - ] - }, - { - "cell_type": "code", - "execution_count": 96, - "id": "40e65682-8cc9-41b6-8778-e98eaa147481", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "131\n", - "A. locate\n", - "B. located\n", - "C. locates\n", - "D. location\n", - "132\n", - "A. The results will be announced later this month.\n", - "B. We are proud to serve our community with excellence.\n", - "C. Pat and Kenny’s shop excelled in all four categories.\n", - "D. Please call in advance to schedule an appointment.\n", - "133\n", - "A. I\n", - "B. We\n", - "C. They\n", - "D. He\n", - "134\n", - "A. While\n", - "B. Despite\n", - "C. Even\n", - "D. Yet\n" - ] - } - ], - "source": [ - "# Part 6\n", - "res = []\n", - "for block in children6:\n", - " res.append(block.text)\n", - "for ch in res:\n", - " print(ch)\n", - " break\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "id": "424b0c80-5708-4408-b7fa-18178450378e", - "metadata": {}, - "outputs": [], - "source": [ - "data = {}\n", - "idx = []\n", - "question = []\n", - "answers = []\n", - "nu = []\n", - "for group in res:\n", - " curGroup = group.split('\\n')\n", - " curlen = len(curGroup)\n", - " rows = curlen // 5\n", - " for i in range(rows):\n", - " idx.append(curGroup[5*i])\n", - " question.append('Question')\n", - " for j in range(4):\n", - " if j != 0:\n", - " idx.append('')\n", - " question.append('')\n", - " \n", - " answers.append(curGroup[1+5*i+j])\n", - " \n", - " nu.append('')\n", - "data = {'Question_id': idx, 'Question_text': question, 'Test_name(ETS23-Test1)':nu,'Answer_id':nu, 'Answer_text': answers, 'isTrue':nu}\n", - "df = pd.DataFrame(data).reset_index(drop=True)\n", - "df\n", - "df.to_csv(\"part6.csv\", index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "id": "4c613023-178a-4d9d-a311-7045e36caa21", - "metadata": {}, - "outputs": [], - "source": [ - "# Part 7\n", - "data = {}\n", - "idx = []\n", - "question = []\n", - "answers = []\n", - "nu = []\n", - "for group in res:\n", - " curGroup = group.split('\\n')\n", - " curlen = len(curGroup)\n", - " rows = curlen // 6\n", - " for i in range(rows):\n", - " idx.append(curGroup[6*i])\n", - " question.append(curGroup[6*i+1])\n", - " for j in range(4):\n", - " if j != 0:\n", - " idx.append('')\n", - " question.append('')\n", - " \n", - " answers.append(curGroup[2+6*i+j])\n", - " \n", - " nu.append('')\n", - "data = {'Question_id': idx, 'Question_text': question, 'Test_name(ETS23-Test1)':nu,'Answer_id':nu, 'Answer_text': answers, 'isTrue':nu}\n", - "df = pd.DataFrame(data).reset_index(drop=True)\n", - "df\n", - "df.to_csv(\"part7.csv\", index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "id": "7c3d8f36-9e9a-42dc-bf49-4d1499d38b1d", - "metadata": {}, - "outputs": [], - "source": [ - "# Part 5\n", - "data = {}\n", - "idx = []\n", - "question = []\n", - "answers = []\n", - "nu = []\n", - "for row in res:\n", - " curRow = row.split('\\n')\n", - " if len(curRow) == 6:\n", - " idx.append(curRow[0])\n", - " question.append(curRow[1])\n", - " for i in range(4):\n", - " if i != 0:\n", - " idx.append('')\n", - " question.append('')\n", - " answers.append(curRow[2+i])\n", - " nu.append('')\n", - "data = {'Question_id': idx, 'Question_text': question, 'Test_name(ETS23-Test1)':nu,'Answer_id':nu, 'Answer_text': answers, 'isTrue':nu}\n", - "df = pd.DataFrame(data).reset_index(drop=True)\n", - " \n", - "df.to_csv(\"part5.csv\", index=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "id": "3c61228c-2b6a-4915-b29d-c6bf1745f282", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idxquestionsanswers
0101When she held her last meeting, Ms. Toba -----...A. encourage
1101When she held her last meeting, Ms. Toba -----...B. is encouraging
2101When she held her last meeting, Ms. Toba -----...C. encouraged
3101When she held her last meeting, Ms. Toba -----...D. was encouraged
4102All staff have been informed ------- the propo...A. for
\n", - "
" - ], - "text/plain": [ - " idx questions answers\n", - "0 101 When she held her last meeting, Ms. Toba -----... A. encourage\n", - "1 101 When she held her last meeting, Ms. Toba -----... B. is encouraging\n", - "2 101 When she held her last meeting, Ms. Toba -----... C. encouraged\n", - "3 101 When she held her last meeting, Ms. Toba -----... D. was encouraged\n", - "4 102 All staff have been informed ------- the propo... A. for" - ] - }, - "execution_count": 82, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "id": "973f70df-1cd6-42bd-b992-48a23ed4d379", - "metadata": {}, - "outputs": [], - "source": [ - "driver.quit()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}