
Commit 276b28d

Merge branch 'master' into raw
2 parents 05f5b60 + f2c4aa4 commit 276b28d

14 files changed: +231, -164 lines

.github/workflows/elixir.yml

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ jobs:
   build:

     name: Build and test
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04

     steps:
       - name: Check out the repository

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -25,4 +25,5 @@ onigumo-*.tar
 # Temporary files, for example, from tests.
 /tmp/

-onigumo
+# Ignore onigumo escript file
+/onigumo

README.md

Lines changed: 8 additions & 8 deletions
@@ -26,13 +26,13 @@ flowchart LR

     onigumo_operator --> spider_materialization[MATERIALIZER]

-    subgraph Onigumo
+    subgraph "Onigumo (kernel)"
         onigumo_operator
         onigumo_downloader
         onigumo_parser
     end

-    subgraph Spider
+    subgraph "Spider (application)"
         spider_operator
         spider_parser
         spider_materialization
@@ -51,14 +51,14 @@ The Operator’s job is to:

 ### Downloader ###

-Stahuje obsah a metadata nezpracovaných URL adres.
+The Downloader fetches and saves the contents and metadata from the unprocessed URL addresses.

-Činnost _downloaderu_ se skládá z:
+The Downloader’s job is to:

-1. načítání URL ke stažení,
-2. kontroly stažených URL,
-3. stahování obsahu URL a případných metadat,
-4. uložení stažených dat.
+1. read URLs for download,
+2. check for the already downloaded URLs,
+3. fetch the URLs contents along with its metadata,
+4. save the downloaded data.

 ### Parser ###

lib/cli.ex

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+defmodule Onigumo.CLI do
+  def main([component]) do
+    module = Module.safe_concat("Onigumo", component)
+    root_path = File.cwd!()
+    module.main(root_path)
+  end
+end
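The new CLI entry point dispatches on the component name passed on the command line. A minimal sketch of the resolution it performs, assuming the project's escript is built with `mix escript.build` and that the target module (here Onigumo.Downloader) is already compiled, since Module.safe_concat/2 refuses to create new atoms:

    # Hypothetical invocation, not part of this commit:
    #   ./onigumo Downloader
    # runs
    Onigumo.CLI.main(["Downloader"])
    # which resolves and delegates roughly as:
    Module.safe_concat("Onigumo", "Downloader")  #=> Onigumo.Downloader
    Onigumo.Downloader.main(File.cwd!())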

lib/onigumo.ex renamed to lib/onigumo/downloader.ex

Lines changed: 3 additions & 3 deletions
@@ -6,11 +6,11 @@ defmodule Onigumo.Downloader do
   def main(root_path) do
     http_client().start()

-    download_urls_from_file(root_path)
+    create_download_stream(root_path)
     |> Stream.run()
   end

-  def download_urls_from_file(root_path) do
+  def create_download_stream(root_path) do
     root_path
     |> load_urls()
     |> Stream.map(&download_url(&1, root_path))
@@ -51,7 +51,7 @@ defmodule Onigumo.Downloader do

   def create_file_name(url) do
     suffix = Application.get_env(:onigumo, :downloaded_suffix)
-    Hash.md5(url, :hex) <> suffix
+    Onigumo.Utilities.Hash.md5(url, :hex) <> suffix
   end

   defp http_client() do
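The rename from download_urls_from_file/1 to create_download_stream/1 makes it clearer that the function only builds a lazy stream; nothing is fetched until the stream is forced. A hedged usage sketch, mirroring main/1 above and the new tests below (the root path and input file are example assumptions):

    # Assumes root_path contains the URL list file configured under :input_path,
    # one URL per line.
    root_path = "/tmp/onigumo"

    root_path
    |> Onigumo.Downloader.create_download_stream()
    |> Stream.run()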

lib/onigumo_cli.ex

Lines changed: 0 additions & 6 deletions
This file was deleted.

lib/spider/html.ex

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+defmodule Onigumo.Spider.HTML do
+  def find_links(document) do
+    Floki.parse_document!(document)
+    |> Floki.find("a")
+    |> Floki.attribute("href")
+  end
+end
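A quick illustrative call of the new helper, assuming the Floki dependency added in mix.exs below is available; the HTML snippet is invented for the example:

    Onigumo.Spider.HTML.find_links(~s(<a href="https://onigumo.local/hello.html">Hi</a>))
    #=> ["https://onigumo.local/hello.html"]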

lib/hash.ex renamed to lib/utilities/hash.ex

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-defmodule Hash do
+defmodule Onigumo.Utilities.Hash do
   def md5(data, fmt) do
     hash(:md5, data)
     |> format(fmt)
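Only the module name changes here; the API stays the same, now namespaced under the project. A small usage sketch (the URL is just an example value; the :hex and :bin formats come from the existing tests):

    url = "http://onigumo.local/hello.html"
    Onigumo.Utilities.Hash.md5(url, :hex)  # hex-encoded MD5 digest, as used by create_file_name/1
    Onigumo.Utilities.Hash.md5(url, :bin)  # raw binary digest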

mix.exs

Lines changed: 4 additions & 1 deletion
@@ -25,7 +25,10 @@ defmodule Onigumo.MixProject do
       # {:dep_from_hexpm, "~> 0.3.0"},
       # {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"},
       {:httpoison, "~> 1.8"},
-      {:mox, "~> 1.0", only: :test}
+      {:mox, "~> 1.0", only: :test},
+
+      # Spider toolbox dependencies
+      {:floki, "~> 0.32"}
     ]
   end

mix.lock

Lines changed: 2 additions & 0 deletions
@@ -1,6 +1,8 @@
 %{
   "certifi": {:hex, :certifi, "2.8.0", "d4fb0a6bb20b7c9c3643e22507e42f356ac090a1dcea9ab99e27e0376d695eba", [:rebar3], [], "hexpm", "6ac7efc1c6f8600b08d625292d4bbf584e14847ce1b6b5c44d983d273e1097ea"},
+  "floki": {:hex, :floki, "0.32.1", "dfe3b8db3b793939c264e6f785bca01753d17318d144bd44b407fb3493acaa87", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm", "d4b91c713e4a784a3f7b1e3cc016eefc619f6b1c3898464222867cafd3c681a3"},
   "hackney": {:hex, :hackney, "1.18.0", "c4443d960bb9fba6d01161d01cd81173089686717d9490e5d3606644c48d121f", [:rebar3], [{:certifi, "~>2.8.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "9afcda620704d720db8c6a3123e9848d09c87586dc1c10479c42627b905b5c5e"},
+  "html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"},
   "httpoison": {:hex, :httpoison, "1.8.0", "6b85dea15820b7804ef607ff78406ab449dd78bed923a49c7160e1886e987a3d", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "28089eaa98cf90c66265b6b5ad87c59a3729bea2e74e9d08f9b51eb9729b3c3a"},
   "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
   "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},

test/hash_test.exs

Lines changed: 16 additions & 16 deletions
@@ -51,31 +51,31 @@ defmodule HashTest do
     }
   ]

-  test("hash MD5 known value in hexadecimal") do
-    for {data, hash_hex, _} <- @known_md5s do
-      hash = Hash.md5(data, :hex)
-      assert(hash == hash_hex)
+  for {data, hash_hex, _} <- @known_md5s do
+    test("hash MD5 #{inspect(data)} in hexadecimal") do
+      hash = Onigumo.Utilities.Hash.md5(unquote(data), :hex)
+      assert(hash == unquote(hash_hex))
     end
   end

-  test("hash MD5 known value in binary") do
-    for {data, _, hash_bin} <- @known_md5s do
-      hash = Hash.md5(data, :bin)
-      assert(hash == hash_bin)
+  for {data, _, hash_bin} <- @known_md5s do
+    test("hash MD5 #{inspect(data)} in binary") do
+      hash = Onigumo.Utilities.Hash.md5(unquote(data), :bin)
+      assert(hash == unquote(hash_bin))
     end
   end

-  test("format a binary hash") do
-    for {format, hash} <- @formatted_hashes do
-      formatted = Hash.format(@binary_hash, format)
-      assert(formatted == hash)
+  for {format, hash} <- @formatted_hashes do
+    test("format #{inspect(@binary_hash)} in #{inspect(format)}") do
+      formatted = Onigumo.Utilities.Hash.format(@binary_hash, unquote(format))
+      assert(formatted == unquote(hash))
     end
   end

-  test("hash a known value") do
-    for {func, known_hash} <- @known_hashes do
-      computed_hash = Hash.hash(func, @known_hash_data)
-      assert(computed_hash == known_hash)
+  for {func, known_hash} <- @known_hashes do
+    test("hash #{inspect(@known_hash_data)} with #{inspect(func)}") do
+      computed_hash = Onigumo.Utilities.Hash.hash(unquote(func), @known_hash_data)
+      assert(computed_hash == unquote(known_hash))
     end
   end
 end
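The rewrite turns each "loop inside one test" into one generated test per table entry: the for comprehension runs at compile time in the module body, so every tuple defines its own named test and its values are injected with unquote/1. A minimal self-contained sketch of the same pattern (module name and data invented for illustration):

    defmodule GeneratedExampleTest do
      use ExUnit.Case

      # One test per entry; a failure now names the exact input that broke.
      for {input, expected} <- [{"ab", 2}, {"abc", 3}] do
        test("String.length/1 of #{inspect(input)}") do
          assert String.length(unquote(input)) == unquote(expected)
        end
      end
    end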

test/onigumo_downloader_test.exs

Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,150 @@
+defmodule OnigumoDownloaderTest do
+  use ExUnit.Case
+  import Mox
+
+  @urls [
+    "http://onigumo.local/hello.html",
+    "http://onigumo.local/bye.html"
+  ]
+  @slices [0..1, 0..-1]
+
+  setup(:verify_on_exit!)
+
+  describe("Onigumo.Downloader.main/1") do
+    @tag :tmp_dir
+    test("run Downloader", %{tmp_dir: tmp_dir}) do
+      expect(HTTPoisonMock, :start, fn -> nil end)
+      expect(HTTPoisonMock, :get!, length(@urls), &prepare_response/1)
+
+      input_path_env = Application.get_env(:onigumo, :input_path)
+      input_path_tmp = Path.join(tmp_dir, input_path_env)
+      input_file_content = prepare_input(@urls)
+      File.write!(input_path_tmp, input_file_content)
+
+      Onigumo.Downloader.main(tmp_dir)
+
+      Enum.map(@urls, &assert_downloaded(&1, tmp_dir))
+    end
+  end
+
+  describe("Onigumo.Downloader.create_download_stream/1") do
+    @tag :tmp_dir
+    test("download URLs from the input file with a created stream", %{tmp_dir: tmp_dir}) do
+      expect(HTTPoisonMock, :get!, length(@urls), &prepare_response/1)
+
+      input_path_env = Application.get_env(:onigumo, :input_path)
+      input_path_tmp = Path.join(tmp_dir, input_path_env)
+      input_file_content = prepare_input(@urls)
+      File.write!(input_path_tmp, input_file_content)
+
+      Onigumo.Downloader.create_download_stream(tmp_dir) |> Stream.run()
+
+      Enum.map(@urls, &assert_downloaded(&1, tmp_dir))
+    end
+  end
+
+  describe("Onigumo.Downloader.download_url/2") do
+    @tag :tmp_dir
+    test("download a URL", %{tmp_dir: tmp_dir}) do
+      expect(HTTPoisonMock, :get!, &prepare_response/1)
+
+      input_url = Enum.at(@urls, 0)
+      Onigumo.Downloader.download_url(input_url, tmp_dir)
+
+      output_file_name = Onigumo.Downloader.create_file_name(input_url)
+      output_path = Path.join(tmp_dir, output_file_name)
+      read_output = File.read!(output_path)
+      expected_output = body(input_url)
+      assert(read_output == expected_output)
+    end
+  end
+
+  describe("Onigumo.Downloader.get_url/1") do
+    test("get response by HTTP request") do
+      expect(HTTPoisonMock, :get!, &prepare_response/1)
+
+      url = Enum.at(@urls, 0)
+      get_response = Onigumo.Downloader.get_url(url)
+      expected_response = prepare_response(url)
+      assert(get_response == expected_response)
+    end
+  end
+
+  describe("Onigumo.Downloader.get_body/1") do
+    test("extract body from URL response") do
+      url = Enum.at(@urls, 0)
+      response = prepare_response(url)
+      get_body = Onigumo.Downloader.get_body(response)
+      expected_body = body(url)
+      assert(get_body == expected_body)
+    end
+  end
+
+  describe("Onigumo.Downloader.write_response/2") do
+    @tag :tmp_dir
+    test("write response to file", %{tmp_dir: tmp_dir}) do
+      response = "Response!"
+      output_file_name = "body.html"
+      output_path = Path.join(tmp_dir, output_file_name)
+      Onigumo.Downloader.write_response(response, output_path)
+
+      read_output = File.read!(output_path)
+      assert(read_output == response)
+    end
+  end
+
+  describe("Onigumo.Downloader.load_urls/1") do
+    for slice <- @slices do
+      @tag :tmp_dir
+      test("load URLs #{inspect(slice)} from a file", %{tmp_dir: tmp_dir}) do
+        input_urls = Enum.slice(@urls, unquote(Macro.escape(slice)))
+
+        input_path_env = Application.get_env(:onigumo, :input_path)
+        input_path_tmp = Path.join(tmp_dir, input_path_env)
+        input_file_content = prepare_input(input_urls)
+        File.write!(input_path_tmp, input_file_content)
+
+        loaded_urls = Onigumo.Downloader.load_urls(tmp_dir) |> Enum.to_list()
+
+        assert(loaded_urls == input_urls)
+      end
+    end
+  end
+
+  describe("Onigumo.Downloader.create_file_name/1") do
+    test("create file name from URL") do
+      input_url = "https://onigumo.local/hello.html"
+      created_file_name = Onigumo.Downloader.create_file_name(input_url)
+
+      input_url_hash = Onigumo.Utilities.Hash.md5(input_url, :hex)
+      downloaded_suffix = Application.get_env(:onigumo, :downloaded_suffix)
+      expected_file_name = input_url_hash <> downloaded_suffix
+
+      assert(created_file_name == expected_file_name)
+    end
+  end
+
+  defp prepare_response(url) do
+    %HTTPoison.Response{
+      status_code: 200,
+      body: body(url)
+    }
+  end
+
+  defp prepare_input(urls) do
+    Enum.map(urls, &(&1 <> "\n"))
+    |> Enum.join()
+  end
+
+  defp body(url) do
+    "Body from: #{url}\n"
+  end
+
+  defp assert_downloaded(url, tmp_dir) do
+    file_name = Onigumo.Downloader.create_file_name(url)
+    output_path = Path.join(tmp_dir, file_name)
+    read_output = File.read!(output_path)
+    expected_output = body(url)
+    assert(read_output == expected_output)
+  end
+end
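These tests rely on HTTPoisonMock being defined with Mox and being returned by the Downloader's private http_client/0 in the test environment; that wiring is not part of this diff. A hedged sketch of what it could look like (Onigumo.HTTPClient is a hypothetical behaviour name and :http_client is an assumed config key):

    # test/test_helper.exs (assumed, not shown in this commit)
    Mox.defmock(HTTPoisonMock, for: Onigumo.HTTPClient)
    Application.put_env(:onigumo, :http_client, HTTPoisonMock)
    ExUnit.start()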
