Skip to content

Commit 235f335

Browse files
authored
Merge pull request #1 from yennanliu/Scraping-dev-001-code-refine
Scraping-dev-001-code-refine
2 parents b81528b + ab04e26 commit 235f335

12 files changed

+863
-628
lines changed

dev_projects/ScrapingService/src/main/java/com/yen/scrpe/ScrappingApplication.java

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,40 +2,51 @@
22

33
import com.yen.scrpe.Task.PokemonCollectTask;
44
import com.yen.scrpe.Task.ScrapeTaskFactory;
5+
import com.yen.scrpe.Task.ScrapeTaskFactory2;
56
import com.yen.scrpe.service.ScrapeService;
6-
import com.yen.scrpe.service.ScrapeServiceMultiThread;
7+
import com.yen.scrpe.service.ScrapeServiceMultiThreadV2Gpt;
78

89
import java.io.IOException;
910

1011
/**
11-
* // https://www.zenrows.com/blog/web-scraping-java#java-web-crawling
12-
* // data source : https://scrapeme.live/shop/
13-
*
12+
* Reference: https://www.zenrows.com/blog/web-scraping-java#java-web-crawling
13+
* Data source: https://scrapeme.live/shop/
1414
*/
1515
public class ScrappingApplication {
1616

17-
public static void main(String[] args) throws IOException, InterruptedException {
17+
public static void main(String[] args) throws IOException, InterruptedException {
1818

19-
Long start = System.currentTimeMillis();
19+
Long start = System.currentTimeMillis();
2020

21-
// to limit the number to scrape to 5
22-
int LIMIT = 3; // 50;
21+
// upper bound on the number of scrape iterations
22+
int LIMIT = 10; // 50;
2323

24-
// ScrapeService scrapeService = new ScrapeService();
25-
// PokemonCollectTask pokemonCollectTask = new PokemonCollectTask(scrapeService);
26-
// pokemonCollectTask.run(LIMIT);
24+
/** V1 : single thread (original code) */
25+
// ScrapeService scrapeService = new ScrapeService();
26+
// PokemonCollectTask pokemonCollectTask = new PokemonCollectTask(scrapeService);
27+
// pokemonCollectTask.run(LIMIT);
28+
//
29+
// ScrapeTaskFactory scrapeTaskFactory = new ScrapeTaskFactory(scrapeService, pokemonCollectTask, LIMIT);
30+
// scrapeTaskFactory.run();
2731

28-
// ScrapeTaskFactory scrapeTaskFactory = new ScrapeTaskFactory(scrapeService, PokemonCollectTask, LIMIT);
29-
// scrapeTaskFactory.run();
3032

31-
ScrapeServiceMultiThread scrapeServiceMultiThread = new ScrapeServiceMultiThread();
32-
scrapeServiceMultiThread.testRun();
33+
/** V2 : multi thread (gpt) */
34+
ScrapeServiceMultiThreadV2Gpt scrapeService = new ScrapeServiceMultiThreadV2Gpt();
35+
PokemonCollectTask pokemonCollectTask = new PokemonCollectTask(scrapeService);
36+
pokemonCollectTask.run(LIMIT);
3337

34-
// System.out.println("pokemonProducts.size() = " + pokemonCollectTask.getPokemonProducts().size());
35-
// System.out.println("pokemonProducts = " + pokemonCollectTask.getPokemonProducts());
38+
ScrapeTaskFactory2 scrapeTaskFactory = new ScrapeTaskFactory2(scrapeService, pokemonCollectTask, LIMIT);
39+
scrapeTaskFactory.run();
3640

37-
Long end = System.currentTimeMillis();
38-
System.out.println("-----> Total duration = " + ( end - start));
39-
}
41+
// ScrapeServiceMultiThread scrapeServiceMultiThread = new
42+
// ScrapeServiceMultiThread();
43+
// scrapeServiceMultiThread.testRun();
4044

41-
}
45+
// System.out.println("pokemonProducts.size() = " +
46+
// pokemonCollectTask.getPokemonProducts().size());
47+
// System.out.println("pokemonProducts = " + pokemonCollectTask.getPokemonProducts());
48+
49+
Long end = System.currentTimeMillis();
50+
System.out.println("-----> Total duration = " + (end - start));
51+
}
52+
}

dev_projects/ScrapingService/src/main/java/com/yen/scrpe/Task/BaseScrapeTask.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@
44

55
public interface BaseScrapeTask {
66

7-
public void run(int limit) throws IOException;
7+
void run(int limit) throws IOException, InterruptedException;
88
}

dev_projects/ScrapingService/src/main/java/com/yen/scrpe/Task/PokemonCollectTask.java

Lines changed: 58 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import com.yen.scrpe.model.PokemonProduct;
44
import com.yen.scrpe.service.BaseScrapeService;
5-
65
import java.io.IOException;
76
import java.util.ArrayList;
87
import java.util.HashSet;
@@ -11,81 +10,81 @@
1110

1211
public class PokemonCollectTask implements BaseScrapeTask {
1312

14-
// attr
15-
//int limit;
13+
// attr
14+
// int limit;
1615

17-
private BaseScrapeService scrapeService;
18-
// initializing the list of Java object to store
19-
// the scraped data
20-
private List<PokemonProduct> pokemonProducts;
21-
// initializing the set of web page urls
22-
private Set<String> pagesDiscovered; // discovered while crawling the target website
23-
// initializing the queue of urls to scrape
24-
private List<String> pagesToScrape;
16+
private BaseScrapeService scrapeService;
17+
// list of Java objects that stores
18+
// the scraped data
19+
private List<PokemonProduct> pokemonProducts;
20+
// initializing the set of web page urls
21+
private Set<String> pagesDiscovered; // discovered while crawling the target website
22+
// initializing the queue of urls to scrape
23+
private List<String> pagesToScrape;
2524

26-
// constructor
27-
// PokemonCollectTaskV1.run(scrapeService, pagesToScrape, pokemonProducts, pagesDiscovered, LIMIT);
28-
public PokemonCollectTask(){
25+
// constructor
26+
// PokemonCollectTaskV1.run(scrapeService, pagesToScrape, pokemonProducts, pagesDiscovered,
27+
// LIMIT);
28+
public PokemonCollectTask() {}
2929

30-
}
30+
public PokemonCollectTask(BaseScrapeService scrapeService) {
3131

32-
public PokemonCollectTask(BaseScrapeService scrapeService){
32+
this.scrapeService = scrapeService;
33+
this.pokemonProducts = new ArrayList<>();
34+
this.pagesDiscovered = new HashSet<>();
35+
this.pagesToScrape = new ArrayList<>();
3336

34-
this.scrapeService = scrapeService;
35-
this.pokemonProducts = new ArrayList<>();
36-
this.pagesDiscovered = new HashSet<>();
37-
this.pagesToScrape = new ArrayList<>();
38-
// initializing the scraping queue with the
39-
this.pagesToScrape.add("https://scrapeme.live/shop/page/1/");
40-
}
37+
// initializing the scraping queue
38+
this.pagesToScrape.add("https://scrapeme.live/shop/page/1/");
39+
this.pagesToScrape.add("https://scrapeme.live/shop/page/2/");
40+
}
4141

42-
// getter, setter
43-
public BaseScrapeService getScrapeService() {
44-
return scrapeService;
45-
}
42+
// getter, setter
43+
public BaseScrapeService getScrapeService() {
44+
return scrapeService;
45+
}
4646

47-
public void setScrapeService(BaseScrapeService scrapeService) {
48-
this.scrapeService = scrapeService;
49-
}
47+
public void setScrapeService(BaseScrapeService scrapeService) {
48+
this.scrapeService = scrapeService;
49+
}
5050

51-
public List<PokemonProduct> getPokemonProducts() {
52-
return pokemonProducts;
53-
}
51+
public List<PokemonProduct> getPokemonProducts() {
52+
return pokemonProducts;
53+
}
5454

55-
public void setPokemonProducts(List<PokemonProduct> pokemonProducts) {
56-
this.pokemonProducts = pokemonProducts;
57-
}
55+
public void setPokemonProducts(List<PokemonProduct> pokemonProducts) {
56+
this.pokemonProducts = pokemonProducts;
57+
}
5858

59-
public Set<String> getPagesDiscovered() {
60-
return pagesDiscovered;
61-
}
59+
public Set<String> getPagesDiscovered() {
60+
return pagesDiscovered;
61+
}
6262

63-
public void setPagesDiscovered(Set<String> pagesDiscovered) {
64-
this.pagesDiscovered = pagesDiscovered;
65-
}
63+
public void setPagesDiscovered(Set<String> pagesDiscovered) {
64+
this.pagesDiscovered = pagesDiscovered;
65+
}
6666

67-
public List<String> getPagesToScrape() {
68-
return pagesToScrape;
69-
}
67+
public List<String> getPagesToScrape() {
68+
return pagesToScrape;
69+
}
7070

71-
public void setPagesToScrape(List<String> pagesToScrape) {
72-
this.pagesToScrape = pagesToScrape;
73-
}
71+
public void setPagesToScrape(List<String> pagesToScrape) {
72+
this.pagesToScrape = pagesToScrape;
73+
}
7474

75-
// method
76-
public void run(int limit) throws IOException {
75+
// method
76+
public void run(int limit) throws IOException, InterruptedException {
7777

78-
int i = 0;
78+
int i = 0;
7979

80-
while (!this.pagesToScrape.isEmpty() && i < limit) {
81-
System.out.println(">>> i = " + i);
80+
while (!this.pagesToScrape.isEmpty() && i < limit) {
81+
System.out.println(">>> i = " + i);
8282

83-
/** help func*/
84-
this.scrapeService.scrapeProductPage(pokemonProducts, pagesDiscovered, pagesToScrape, i);
83+
/** help func */
84+
this.scrapeService.scrapeProductPage(pokemonProducts, pagesDiscovered, pagesToScrape, i);
8585

86-
// incrementing the iteration number
87-
i++;
88-
}
86+
// incrementing the iteration number
87+
i++;
8988
}
90-
89+
}
9190
}
Lines changed: 30 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,52 @@
11
package com.yen.scrpe.Task;
22

33
import com.yen.scrpe.service.BaseScrapeService;
4-
54
import java.io.IOException;
65

7-
/** Factory for constructing scraping job
6+
/**
7+
* Factory for constructing scraping job
88
*
9-
* Design pattern : Factory
9+
* <p>Design pattern : Factory
1010
*
11-
* - https://www.runoob.com/design-pattern/factory-pattern.html
11+
* <p>- https://www.runoob.com/design-pattern/factory-pattern.html
1212
*/
1313
public class ScrapeTaskFactory {
1414

15-
// attr
16-
private BaseScrapeService scrapeService;
15+
// attr
16+
private BaseScrapeService scrapeService;
1717

18-
private BaseScrapeTask scrapeTask;
18+
private BaseScrapeTask scrapeTask;
1919

20-
private String jobName;
20+
private String jobName;
2121

22-
private int limit;
22+
private int limit;
2323

24-
// constructor
25-
public ScrapeTaskFactory(){
24+
// constructor
25+
public ScrapeTaskFactory() {}
2626

27-
}
27+
public ScrapeTaskFactory(
28+
BaseScrapeService scrapeService, BaseScrapeTask scrapeTask, Integer limit) {
2829

29-
public ScrapeTaskFactory(BaseScrapeService scrapeService, BaseScrapeTask scrapeTask, Integer limit){
30+
this.scrapeService = scrapeService;
31+
this.scrapeTask = scrapeTask;
32+
this.limit = limit;
33+
}
3034

31-
this.scrapeService = scrapeService;
32-
this.scrapeTask = scrapeTask;
33-
this.limit = limit;
34-
}
35+
// method
36+
public void run() throws IOException, InterruptedException {
3537

36-
// method
37-
public void run() throws IOException {
38+
// this.jobName = "PokemonCollectTask";
39+
// System.out.println("this.jobName = " + this.jobName);
3840

39-
switch (this.jobName){
40-
case "PokemonCollectTask":
41-
// pokemonCollectTask.run(LIMIT);
42-
this.scrapeTask.run(this.limit);
43-
default:
44-
throw new RuntimeException("Not a valid task name :" + this.scrapeTask.toString());
41+
this.scrapeTask.run(this.limit);
4542

46-
}
47-
}
43+
// switch (this.jobName) {
44+
// case "PokemonCollectTask":
45+
// // pokemonCollectTask.run(LIMIT);
46+
// this.scrapeTask.run(this.limit);
47+
// default:
48+
// throw new RuntimeException("Not a valid task name :" + this.scrapeTask.toString());
49+
// }
50+
}
4851

4952
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package com.yen.scrpe.Task;
2+
3+
import com.yen.scrpe.Task.PokemonCollectTask;
4+
import com.yen.scrpe.model.PokemonProduct;
5+
import com.yen.scrpe.service.ScrapeServiceMultiThreadV2Gpt;
6+
7+
import java.io.IOException;
8+
import java.util.HashSet;
9+
import java.util.LinkedList;
10+
import java.util.List;
11+
import java.util.Set;
12+
13+
public class ScrapeTaskFactory2 {
14+
15+
private final ScrapeServiceMultiThreadV2Gpt scrapeService;
16+
private final PokemonCollectTask pokemonCollectTask;
17+
private final int limit;
18+
19+
public ScrapeTaskFactory2(ScrapeServiceMultiThreadV2Gpt scrapeService, PokemonCollectTask pokemonCollectTask, int limit) {
20+
this.scrapeService = scrapeService;
21+
this.pokemonCollectTask = pokemonCollectTask;
22+
this.limit = limit;
23+
}
24+
25+
public void run() throws IOException, InterruptedException {
26+
List<PokemonProduct> pokemonProducts = new LinkedList<>();
27+
Set<String> pagesDiscovered = new HashSet<>();
28+
List<String> pagesToScrape = new LinkedList<>();
29+
pagesToScrape.add("https://scrapeme.live/shop");
30+
31+
//pokemonCollectTask.run(pokemonProducts, pagesDiscovered, pagesToScrape, limit);
32+
pokemonCollectTask.run(limit);
33+
34+
System.out.println("Scraping completed.");
35+
System.out.println("Collected Pokemon Products:");
36+
for (PokemonProduct product : pokemonProducts) {
37+
System.out.println(product);
38+
}
39+
40+
// Ensure the executor service is shut down
41+
scrapeService.shutdown();
42+
}
43+
44+
}

0 commit comments

Comments
 (0)