Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 25 additions & 17 deletions src/main/kotlin/com/wafflestudio/team2server/crawler/BaseCrawler.kt
Original file line number Diff line number Diff line change
Expand Up @@ -44,37 +44,34 @@ abstract class BaseCrawler(
* 이 메서드를 override 하여 사이트 특성에 맞는 탐색 로직을 직접 구현해야 합니다.
*/
open fun crawl() {
try {
val listDoc = fetch(listUrl)
val listDoc = fetch(listUrl)

val rows = getPostList(listDoc)
val rows = getPostList(listDoc)

for (row in rows) {
val rawLink = getPostLink(row)
val detailUrl = if (rawLink.startsWith("http")) rawLink else "$baseUrl$rawLink"
for (row in rows) {
val rawLink = getPostLink(row)
val detailUrl = if (rawLink.startsWith("http")) rawLink else "$baseUrl$rawLink"

if (articleRepository.existsByOriginLink(detailUrl)) {
continue
}
if (articleRepository.existsByOriginLink(detailUrl)) {
continue
}

val detailDoc = fetch(detailUrl)
val detailDoc = fetch(detailUrl)

val title = getPostTitle(row)
val title = getPostTitle(row)

val article = parseDetailAndGetArticle(targetBoardId, row, detailDoc, detailUrl, title)
val article = parseDetailAndGetArticle(targetBoardId, row, detailDoc, detailUrl, title)

articleService.saveNewArticle(article)
articleService.saveNewArticle(article)

Thread.sleep(500)
}
} catch (_: Exception) {
Thread.sleep(500)
}
}

/**
 * Downloads the page at [url] with Jsoup and returns it as a parsed [Document].
 *
 * The request identifies itself with a desktop Chrome User-Agent string and uses a
 * timeout value of 10000 (milliseconds, per Jsoup's `Connection.timeout`).
 *
 * Any exception raised by Jsoup while connecting or parsing is not handled here and
 * propagates to the caller.
 *
 * @param url absolute URL of the page to download
 * @return the parsed document
 */
protected fun fetch(url: String): Document =
    Jsoup
        .connect(url)
        // A single User-Agent is enough: Jsoup overwrites the header on every userAgent() call,
        // so the earlier placeholder value ("Mozilla/5.0 ...") never reached the wire.
        .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
        .timeout(10000)
        .get()

Expand All @@ -86,4 +83,15 @@ abstract class BaseCrawler(
} catch (_: Exception) {
}
}

/**
 * Runs one crawl cycle and, if it completes, records the execution time.
 *
 * Calls [crawl] followed by [updateExecutionTime]. Any [Exception] thrown by either call
 * is caught here, so a failure does not propagate to the caller: the stack trace is printed
 * and a one-line summary with the crawler code, the exception message and the current
 * instant is written to standard output. When [crawl] fails, [updateExecutionTime] is not
 * called.
 *
 * Subclasses override this method to attach their own scheduling annotation and delegate
 * back with `super.runScheduled()`.
 */
open fun runScheduled() {
    try {
        crawl()
        updateExecutionTime()
    } catch (e: Exception) {
        // crawl() pauses with Thread.sleep between requests; if that sleep is interrupted,
        // the interrupt status has been cleared by the time we get here. Restore it so the
        // owning executor can still observe the interruption instead of it being lost.
        if (e is InterruptedException) {
            Thread.currentThread().interrupt()
        }
        e.printStackTrace()
        val currentDatetime = Instant.now()
        println("Error in crawler $code ${e.message} $currentDatetime")
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class CareerCrawlerService(
try {
val localDate = LocalDate.parse(dateStr, detailDateFormatter)
localDate.atStartOfDay(ZoneId.of("Asia/Seoul")).toInstant()
} catch (e: Exception) {
} catch (_: Exception) {
Instant.now()
}

Expand All @@ -81,8 +81,7 @@ class CareerCrawlerService(
}

@Scheduled(fixedRate = 86400000)
fun runScheduled() {
crawl()
updateExecutionTime()
override fun runScheduled() {
super.runScheduled()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,50 +35,44 @@ class CseCrawlerService(
var collectedCount = 0
val targetCount = 20

try {
while (collectedCount < targetCount) {
val pageUrl = "$listUrl?pageNum=$page"
while (collectedCount < targetCount) {
val pageUrl = "$listUrl?pageNum=$page"

val document = fetch(pageUrl)
val document = fetch(pageUrl)

val rows = getPostList(document)
if (rows.isEmpty()) break
val rows = getPostList(document)
if (rows.isEmpty()) break

for (row in rows) {
if (collectedCount >= targetCount) break
for (row in rows) {
if (collectedCount >= targetCount) break

val tempTitle = getPostTitle(row)
val tempTitle = getPostTitle(row)

if (tempTitle.isBlank()) {
continue
}

if (isPinnedPost(row)) {
continue
}
if (tempTitle.isBlank()) {
continue
}

val rawLink = getPostLink(row)
val detailUrl = if (rawLink.startsWith("http")) rawLink else "$baseUrl$rawLink"
if (isPinnedPost(row)) {
continue
}

if (!articleRepository.existsByOriginLink(detailUrl)) {
val title = getPostTitle(row)
val rawLink = getPostLink(row)
val detailUrl = if (rawLink.startsWith("http")) rawLink else "$baseUrl$rawLink"

val detailDoc = fetch(detailUrl)
if (!articleRepository.existsByOriginLink(detailUrl)) {
val title = getPostTitle(row)

val article = parseDetailAndGetArticle(targetBoardId, row, detailDoc, detailUrl, title)
articleService.saveNewArticle(article)
}
val detailDoc = fetch(detailUrl)

collectedCount++
val article = parseDetailAndGetArticle(targetBoardId, row, detailDoc, detailUrl, title)
articleService.saveNewArticle(article)
}

page++
Thread.sleep(500)
collectedCount++
}
} catch (e: Exception) {
e.printStackTrace()
} finally {
updateExecutionTime()

page++
Thread.sleep(500)
}
}

Expand Down Expand Up @@ -157,7 +151,7 @@ class CseCrawlerService(
}

@Scheduled(fixedRate = 3600000)
fun runScheduled() {
crawl()
override fun runScheduled() {
super.runScheduled()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,7 @@ class MysnuCrawlerService(
}

@Scheduled(fixedRate = 3600000)
fun runScheduled() {
crawl()
updateExecutionTime()
override fun runScheduled() {
super.runScheduled()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -127,36 +127,32 @@ class SnutiCrawlerService(
}

override fun crawl() {
try {
val listDoc = fetch(listUrl)
val listDoc = fetch(listUrl)

val rows = getPostList(listDoc)
val rows = getPostList(listDoc)

for (row in rows) {
val rawLink = getPostLink(row)
val detailUrl = if (rawLink.startsWith("http")) rawLink else "$baseUrl$rawLink"
for (row in rows) {
val rawLink = getPostLink(row)
val detailUrl = if (rawLink.startsWith("http")) rawLink else "$baseUrl$rawLink"

if (articleRepository.existsByOriginLink(detailUrl)) {
continue
}
if (articleRepository.existsByOriginLink(detailUrl)) {
continue
}

val detailDoc = fetchDetail(detailUrl)
val detailDoc = fetchDetail(detailUrl)

val title = getPostTitle(row)
val title = getPostTitle(row)

val article = parseDetailAndGetArticle(targetBoardId, row, detailDoc, detailUrl, title)
val article = parseDetailAndGetArticle(targetBoardId, row, detailDoc, detailUrl, title)

articleService.saveNewArticle(article)
articleService.saveNewArticle(article)

Thread.sleep(500)
}
} catch (_: Exception) {
Thread.sleep(500)
}
}

@Scheduled(fixedRate = 3600000)
fun runScheduled() {
crawl()
updateExecutionTime()
override fun runScheduled() {
super.runScheduled()
}
}