Skip to content

Commit 3c8417f

Browse files
authored
⚡ 크롤러 시각 업데이트 로직 개선 (#36)
1 parent 522b510 commit 3c8417f

File tree

5 files changed

+72
-76
lines changed

5 files changed

+72
-76
lines changed

src/main/kotlin/com/wafflestudio/team2server/crawler/BaseCrawler.kt

Lines changed: 25 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -44,37 +44,34 @@ abstract class BaseCrawler(
4444
* 이 메서드를 override 하여 사이트 특성에 맞는 탐색 로직을 직접 구현해야 합니다.
4545
*/
4646
open fun crawl() {
47-
try {
48-
val listDoc = fetch(listUrl)
47+
val listDoc = fetch(listUrl)
4948

50-
val rows = getPostList(listDoc)
49+
val rows = getPostList(listDoc)
5150

52-
for (row in rows) {
53-
val rawLink = getPostLink(row)
54-
val detailUrl = if (rawLink.startsWith("http")) rawLink else "$baseUrl$rawLink"
51+
for (row in rows) {
52+
val rawLink = getPostLink(row)
53+
val detailUrl = if (rawLink.startsWith("http")) rawLink else "$baseUrl$rawLink"
5554

56-
if (articleRepository.existsByOriginLink(detailUrl)) {
57-
continue
58-
}
55+
if (articleRepository.existsByOriginLink(detailUrl)) {
56+
continue
57+
}
5958

60-
val detailDoc = fetch(detailUrl)
59+
val detailDoc = fetch(detailUrl)
6160

62-
val title = getPostTitle(row)
61+
val title = getPostTitle(row)
6362

64-
val article = parseDetailAndGetArticle(targetBoardId, row, detailDoc, detailUrl, title)
63+
val article = parseDetailAndGetArticle(targetBoardId, row, detailDoc, detailUrl, title)
6564

66-
articleService.saveNewArticle(article)
65+
articleService.saveNewArticle(article)
6766

68-
Thread.sleep(500)
69-
}
70-
} catch (_: Exception) {
67+
Thread.sleep(500)
7168
}
7269
}
7370

7471
protected fun fetch(url: String): Document =
7572
Jsoup
7673
.connect(url)
77-
.userAgent("Mozilla/5.0 ...")
74+
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
7875
.timeout(10000)
7976
.get()
8077

@@ -86,4 +83,15 @@ abstract class BaseCrawler(
8683
} catch (_: Exception) {
8784
}
8885
}
86+
87+
open fun runScheduled() {
88+
try {
89+
crawl()
90+
updateExecutionTime()
91+
} catch (e: Exception) {
92+
e.printStackTrace()
93+
val currentDatetime = Instant.now()
94+
println("Error in crawler $code ${e.message} $currentDatetime")
95+
}
96+
}
8997
}

src/main/kotlin/com/wafflestudio/team2server/crawler/service/CareerCrawlerService.kt

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,7 @@ class CareerCrawlerService(
6464
try {
6565
val localDate = LocalDate.parse(dateStr, detailDateFormatter)
6666
localDate.atStartOfDay(ZoneId.of("Asia/Seoul")).toInstant()
67-
} catch (e: Exception) {
67+
} catch (_: Exception) {
6868
Instant.now()
6969
}
7070

@@ -81,8 +81,7 @@ class CareerCrawlerService(
8181
}
8282

8383
@Scheduled(fixedRate = 86400000)
84-
fun runScheduled() {
85-
crawl()
86-
updateExecutionTime()
84+
override fun runScheduled() {
85+
super.runScheduled()
8786
}
8887
}

src/main/kotlin/com/wafflestudio/team2server/crawler/service/CseCrawlerService.kt

Lines changed: 27 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -35,50 +35,44 @@ class CseCrawlerService(
3535
var collectedCount = 0
3636
val targetCount = 20
3737

38-
try {
39-
while (collectedCount < targetCount) {
40-
val pageUrl = "$listUrl?pageNum=$page"
38+
while (collectedCount < targetCount) {
39+
val pageUrl = "$listUrl?pageNum=$page"
4140

42-
val document = fetch(pageUrl)
41+
val document = fetch(pageUrl)
4342

44-
val rows = getPostList(document)
45-
if (rows.isEmpty()) break
43+
val rows = getPostList(document)
44+
if (rows.isEmpty()) break
4645

47-
for (row in rows) {
48-
if (collectedCount >= targetCount) break
46+
for (row in rows) {
47+
if (collectedCount >= targetCount) break
4948

50-
val tempTitle = getPostTitle(row)
49+
val tempTitle = getPostTitle(row)
5150

52-
if (tempTitle.isBlank()) {
53-
continue
54-
}
55-
56-
if (isPinnedPost(row)) {
57-
continue
58-
}
51+
if (tempTitle.isBlank()) {
52+
continue
53+
}
5954

60-
val rawLink = getPostLink(row)
61-
val detailUrl = if (rawLink.startsWith("http")) rawLink else "$baseUrl$rawLink"
55+
if (isPinnedPost(row)) {
56+
continue
57+
}
6258

63-
if (!articleRepository.existsByOriginLink(detailUrl)) {
64-
val title = getPostTitle(row)
59+
val rawLink = getPostLink(row)
60+
val detailUrl = if (rawLink.startsWith("http")) rawLink else "$baseUrl$rawLink"
6561

66-
val detailDoc = fetch(detailUrl)
62+
if (!articleRepository.existsByOriginLink(detailUrl)) {
63+
val title = getPostTitle(row)
6764

68-
val article = parseDetailAndGetArticle(targetBoardId, row, detailDoc, detailUrl, title)
69-
articleService.saveNewArticle(article)
70-
}
65+
val detailDoc = fetch(detailUrl)
7166

72-
collectedCount++
67+
val article = parseDetailAndGetArticle(targetBoardId, row, detailDoc, detailUrl, title)
68+
articleService.saveNewArticle(article)
7369
}
7470

75-
page++
76-
Thread.sleep(500)
71+
collectedCount++
7772
}
78-
} catch (e: Exception) {
79-
e.printStackTrace()
80-
} finally {
81-
updateExecutionTime()
73+
74+
page++
75+
Thread.sleep(500)
8276
}
8377
}
8478

@@ -157,7 +151,7 @@ class CseCrawlerService(
157151
}
158152

159153
@Scheduled(fixedRate = 3600000)
160-
fun runScheduled() {
161-
crawl()
154+
override fun runScheduled() {
155+
super.runScheduled()
162156
}
163157
}

src/main/kotlin/com/wafflestudio/team2server/crawler/service/MysnuCrawlerService.kt

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -81,8 +81,7 @@ class MysnuCrawlerService(
8181
}
8282

8383
@Scheduled(fixedRate = 3600000)
84-
fun runScheduled() {
85-
crawl()
86-
updateExecutionTime()
84+
override fun runScheduled() {
85+
super.runScheduled()
8786
}
8887
}

src/main/kotlin/com/wafflestudio/team2server/crawler/service/SnutiCrawlerService.kt

Lines changed: 15 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -127,36 +127,32 @@ class SnutiCrawlerService(
127127
}
128128

129129
override fun crawl() {
130-
try {
131-
val listDoc = fetch(listUrl)
130+
val listDoc = fetch(listUrl)
132131

133-
val rows = getPostList(listDoc)
132+
val rows = getPostList(listDoc)
134133

135-
for (row in rows) {
136-
val rawLink = getPostLink(row)
137-
val detailUrl = if (rawLink.startsWith("http")) rawLink else "$baseUrl$rawLink"
134+
for (row in rows) {
135+
val rawLink = getPostLink(row)
136+
val detailUrl = if (rawLink.startsWith("http")) rawLink else "$baseUrl$rawLink"
138137

139-
if (articleRepository.existsByOriginLink(detailUrl)) {
140-
continue
141-
}
138+
if (articleRepository.existsByOriginLink(detailUrl)) {
139+
continue
140+
}
142141

143-
val detailDoc = fetchDetail(detailUrl)
142+
val detailDoc = fetchDetail(detailUrl)
144143

145-
val title = getPostTitle(row)
144+
val title = getPostTitle(row)
146145

147-
val article = parseDetailAndGetArticle(targetBoardId, row, detailDoc, detailUrl, title)
146+
val article = parseDetailAndGetArticle(targetBoardId, row, detailDoc, detailUrl, title)
148147

149-
articleService.saveNewArticle(article)
148+
articleService.saveNewArticle(article)
150149

151-
Thread.sleep(500)
152-
}
153-
} catch (_: Exception) {
150+
Thread.sleep(500)
154151
}
155152
}
156153

157154
@Scheduled(fixedRate = 3600000)
158-
fun runScheduled() {
159-
crawl()
160-
updateExecutionTime()
155+
override fun runScheduled() {
156+
super.runScheduled()
161157
}
162158
}

0 commit comments

Comments
 (0)