Skip to content

Commit 2a62997

Browse files
Merge pull request #2194 from NitkarshChourasia/testing
add: scrape animal name success.
2 parents ef5e064 + 3555ff5 commit 2a62997

File tree

4 files changed

+1213
-0
lines changed

4 files changed

+1213
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import requests
2+
from requests import get
3+
from bs4 import BeautifulSoup
4+
import pandas as pd
5+
import numpy as np
6+
import html5lib
7+
8+
# * Using html5lib as the parser is good
9+
# * It is the most lenient parser and parses pages the way a web browser does
10+
11+
animals_A_to_Z_URL = "https://animalcorner.org/animal-sitemap/#"

# `results` is the requests.Response returned for the GET request;
# `results.text` is the body of that response decoded to a str.
# A timeout is passed because requests waits indefinitely by default.
results = requests.get(animals_A_to_Z_URL, timeout=30)

# Stop here on a 4xx/5xx answer instead of silently parsing an error page.
results.raise_for_status()

# "html.parser" (standard library) would also work here:
#     soup = BeautifulSoup(results.text, "html.parser")
# * will use html5lib as the parser
soup = BeautifulSoup(results.text, "html5lib")
19+
20+
# print(soup.prettify())
21+
22+
# Collected animal names (starts empty)
animal_name = []

# Collected animal titles (starts empty)
animal_title = []
27+
28+
# alphabet_head = soup.find_all("div", class_="wp-block-heading")
29+
# alphabet_head = soup.find_all("div", class_="side-container")
30+
# * .text gives all the text of the element and of its descendants
31+
# * .string gives the tag's single text child, or None if the tag has more than one child
32+
# print(soup.find_all("h2", class_="wp-block-heading"))
33+
# az_title = soup.find_all("h2", class_="wp-block-heading")
34+
# Every <div> whose class attribute is exactly this string.
# NOTE: a multi-word class_ value is compared against the full attribute
# string, so divs carrying extra classes or a different class order are
# not matched.
az_names = soup.find_all(
    "div",
    class_="wp-block-column is-layout-flow wp-block-column-is-layout-flow",
)
37+
# az_title = soup
38+
# for title in az_title:
39+
# # print(title.text)
40+
# print(title.string)
41+
# print(title.find(class_="wp-block-testing"))
42+
43+
def _next_content_sibling(node):
    """Return the first sibling after *node* that carries content.

    <br> tags and whitespace-only text nodes are skipped. Returns None when
    the siblings run out before any content is found.
    """
    sibling = node.next_sibling
    while sibling is not None:
        is_break = getattr(sibling, "name", None) == "br"
        # Text nodes are str subclasses; a blank one is only layout whitespace.
        is_blank_text = isinstance(sibling, str) and not sibling.strip()
        if not (is_break or is_blank_text):
            return sibling
        sibling = sibling.next_sibling
    return None


# For every matched column, take the node that follows each <br>, print its
# text and keep it in `animal_name`.
for name_div in az_names:
    # Consecutive <br> tags all lead to the same following node; remember the
    # nodes already handled so each name is reported only once.
    seen_nodes = set()

    for br_tag in name_div.find_all("br"):
        target = _next_content_sibling(br_tag)
        if target is None or id(target) in seen_nodes:
            continue
        seen_nodes.add(id(target))

        name = target.text.strip()
        if not name:
            # Nothing printable (e.g. an empty element): skip the blank line.
            continue

        animal_name.append(name)
        print(name)
58+
59+
# print(name.text)
60+
61+
# print(soup.h2.string)
62+
63+
# for container in alphabet_head:
64+
# print(container.text, end="\n")
65+
# titles = container.div.div.find("h2", class_="wp-block-heading")
66+
# title = container.find("h2", class_="wp-block-heading")
67+
# title = container.h3.text
68+
# print(title.text, end="\n")
69+
70+
# print(container.find_all("h2", class_ = "wp-block-heading"))
71+
72+
73+
# print(soup.get_text(), end="\p")
74+
75+
# TODO: write the scraped names to a file, then sort and analyse them

0 commit comments

Comments
 (0)