get_crawl_commands.py
import argparse
import os
import sys
from urllib.parse import urlparse

import pandas as pd


def normalize_url(url):
    # return the normalized domain: the netloc with any leading "www."
    # removed; the path and query are not important as we are doing a deep crawl
    if not url.startswith(("http://", "https://")):
        url = "http://{0}".format(url)
    parts = urlparse(url)
    # strip a leading "www." from the netloc; note that str.strip("www.")
    # removes characters rather than a prefix and would mangle domains
    # like "web.example.com", so match the prefix explicitly
    domain = parts.netloc
    if domain.startswith("www."):
        domain = domain[len("www."):]
    return domain
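
# Illustrative behaviour of normalize_url (hypothetical URLs, not from the
# original repo):
#   normalize_url("www.example.com/news?id=1")  -> "example.com"
#   normalize_url("https://www.example.co.in")  -> "example.co.in"
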
def read_lines(path):
    # if the path doesn't exist, return an empty list
    if not os.path.exists(path):
        return []
    with open(path, "r") as f:
        lines = [line.strip("\n") for line in f.readlines()]
    return lines

def read_csv_sources(path):
df = pd.read_csv(path)
return df["home_url"].tolist()
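
# The CSV is assumed to have a "home_url" column listing one source per row
# (any other columns are ignored), e.g.:
#   home_url
#   http://example.com
#   http://news.example.org
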
def create_txt(out_file, lines):
    # append a newline per line only if the lines do not already contain one;
    # guard against an empty list before peeking at lines[0]
    add_newline = bool(lines) and "\n" not in lines[0]
    with open(out_file, "w") as outfile:
        for line in lines:
            if add_newline:
                outfile.write(line + "\n")
            else:
                outfile.write(line)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate crawl commands for the sources in a text/CSV file"
    )
    parser.add_argument(
        "-q", "--quiet", help="Quiet mode", action="store_true", default=False
    )
parser.add_argument("-f", "--file", help="Text/CSV file to crawl", required=True)
parser.add_argument(
"-lp", "--log_path", help="Path to webcorpus logs folder", required=True
)
    parser.add_argument(
        "-of",
        "--output_folder",
        help="Output folder for saving crawls; each crawl is saved under {output_folder}/{lang_code}/{domain_name}",
        required=True,
    )
args = parser.parse_args()
file = args.file
if file.endswith(".csv"):
lines = read_csv_sources(file)
elif file.endswith(".txt"):
lines = read_lines(file)
else:
raise NotImplementedError("Only .txt and .csv files are supported")
    if not args.quiet:
        print("Found {0} sources".format(len(lines)))
        print("sample sources: ", lines[:5])
# convert log_path and output folder to absolute paths
args.log_path = os.path.abspath(args.log_path)
args.output_folder = os.path.abspath(args.output_folder)
# convert windows path to unix path
args.log_path = args.log_path.replace("\\", "/")
args.output_folder = args.output_folder.replace("\\", "/")
    # the language code is taken from the file name, e.g. sources/hi.csv -> hi
    lang_code = os.path.splitext(os.path.basename(file))[0]
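
    # Each source is turned into one curl call against the scrapyd scheduling
    # endpoint used below. A generated command looks roughly like this
    # (hypothetical paths and domain):
    #   curl http://localhost/schedule.json -d project=webcorpus \
    #       -d spider=recursive-spider -d html_path=/crawls/hi/example_site \
    #       -d source_name=example_site -d home_url=http://example.com \
    #       -d lang=hi -d log_path=/webcorpus/logs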
    commands = []
    for line in lines:
        line = line.strip("\n")
        if not line.startswith(("http://", "https://")):
            line = "http://{0}".format(line)
        # drop the last domain component (.com, .org, .net, .edu, etc.)
        domain_name = ".".join(normalize_url(line).split(".")[:-1])
        if domain_name == "":
            print("==============================================")
            print(f"recheck this source {line} -> {normalize_url(line)}")
            print("==============================================")
            sys.exit(1)
        # replace ".", "/" and "\" in the source name with "_"
        domain_name = domain_name.replace(".", "_")
        domain_name = domain_name.replace("/", "_")
        domain_name = domain_name.replace("\\", "_")
        html_path = f"{args.output_folder}/{lang_code}/{domain_name}"
        source_name = domain_name
        home_url = line
        command = f"curl http://localhost/schedule.json -d project=webcorpus -d spider=recursive-spider -d html_path={html_path} -d source_name={source_name} -d home_url={home_url} -d lang={lang_code} -d log_path={args.log_path}"
        commands.append(command)
        if not args.quiet:
            print(command)
create_txt(
f"{lang_code}_commands.txt",
commands,
)
    if not args.quiet:
        print()
        print("Commands written to {0}_commands.txt".format(lang_code))