Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dependencies {
implementation("org.jetbrains.kotlin:kotlin-reflect")
implementation("tools.jackson.module:jackson-module-kotlin")
implementation("org.springdoc:springdoc-openapi-starter-webmvc-ui:2.7.0")
implementation("org.jsoup:jsoup:1.17.2")
compileOnly("org.projectlombok:lombok")
developmentOnly("org.springframework.boot:spring-boot-docker-compose")
runtimeOnly("com.mysql:mysql-connector-j")
Expand Down
2 changes: 2 additions & 0 deletions src/main/kotlin/com/wafflestudio/team2server/Application.kt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package com.wafflestudio.team2server

import org.springframework.boot.autoconfigure.SpringBootApplication
import org.springframework.boot.runApplication
import org.springframework.scheduling.annotation.EnableScheduling

/**
 * Spring Boot application class.
 *
 * `@EnableScheduling` is required for methods annotated with `@Scheduled`
 * to be picked up and executed by Spring's task scheduler.
 */
@EnableScheduling
@SpringBootApplication
class Application

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package com.wafflestudio.team2server.article.model

import org.springframework.data.annotation.Id
import org.springframework.data.relational.core.mapping.Table
import java.time.Instant

/**
 * A single article row, mapped to the `articles` table.
 *
 * [id], [createdAt] and [updatedAt] default to `null`.
 * NOTE(review): presumably `id` is generated by the database on insert — confirm against the schema.
 */
@Table("articles")
data class Article(
@Id
val id: Long? = null,
// Identifier of the board this article is stored under.
val boardId: Long,
// Article body; the MySNU crawler stores the HTML of the detail page's text area here.
val content: String,
val author: String,
val title: String,
// URL of the original post; crawlers look this value up to skip posts that were already stored.
val originLink: String,
// Publication time of the original post.
val publishedAt: Instant,
val createdAt: Instant? = null,
val updatedAt: Instant? = null,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package com.wafflestudio.team2server.article.repository

import com.wafflestudio.team2server.article.model.Article
import org.springframework.data.repository.ListCrudRepository

/** Spring Data repository providing the standard list-returning CRUD operations for [Article]. */
interface ArticleRepository : ListCrudRepository<Article, Long>
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package com.wafflestudio.team2server.crawler

import com.wafflestudio.team2server.article.model.Article
import com.wafflestudio.team2server.article.repository.ArticleRepository
import com.wafflestudio.team2server.crawler.repository.CrawlerRepository
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import java.time.Instant

/**
 * Template for site-specific notice crawlers.
 *
 * A subclass supplies the site configuration (URLs, board id, code, interval) and the
 * HTML-parsing hooks; this class drives the "fetch list -> skip known posts -> fetch
 * detail -> save" loop in [crawl].
 *
 * Failures are logged rather than propagated: [crawl] and [updateExecutionTime] are
 * best-effort operations and never throw an [Exception] to the caller.
 */
abstract class BaseCrawler(
    private val crawlerRepository: CrawlerRepository,
    private val articleRepository: ArticleRepository,
) {
    /** URL of the list page fetched at the start of every crawl. */
    abstract val listUrl: String

    /** Prefix prepended to post links that do not start with "http". */
    abstract val baseUrl: String

    /** Board id passed to [parseDetailAndGetArticle] and used when recording the execution time. */
    abstract val targetBoardId: Long

    /** Short identifier of this crawler. */
    abstract val code: String

    /** Seconds added to the current time to compute the next run in [updateExecutionTime]. */
    abstract val crawlIntervalSeconds: Long

    /** Extracts one element per post from the list page. */
    protected abstract fun getPostList(document: Document): List<Element>

    /** Extracts the post title from a list-page element. */
    protected abstract fun getPostTitle(element: Element): String

    /** Extracts the post link from a list-page element; it may be absolute or relative to [baseUrl]. */
    protected abstract fun getPostLink(element: Element): String

    /**
     * Builds the [Article] to store for one post.
     *
     * @param boardId board the article belongs to
     * @param element the post's element on the list page
     * @param detailDoc the fetched detail page
     * @param url absolute URL of the detail page
     * @param title title extracted by [getPostTitle]
     */
    protected abstract fun parseDetailAndGetArticle(
        boardId: Long,
        element: Element,
        detailDoc: Document,
        url: String,
        title: String,
    ): Article

    /**
     * Fetches the post list of the target site and crawls every post that is not stored yet.
     *
     * The default implementation fetches the list page only once. Most sites keep the latest
     * notices on the first page, so this method can usually be used as is.
     *
     * For sites that require paging through several list pages to see all notices (for example
     * the Computer Science and Engineering department site), override this method and implement
     * the site-specific traversal.
     *
     * Error handling: a failure to load the list ends the crawl; a failure on a single post is
     * logged and the remaining posts are still processed. If the thread is interrupted while
     * pausing between requests, the interrupt flag is restored and the crawl stops.
     */
    open fun crawl() {
        val rows =
            try {
                getPostList(fetch(listUrl))
            } catch (e: Exception) {
                warn("[$code] failed to load post list from $listUrl", e)
                return
            }

        for (row in rows) {
            val shouldPause =
                try {
                    crawlRow(row)
                } catch (e: Exception) {
                    // Isolate the failure so one bad post cannot block every later post.
                    warn("[$code] failed to crawl a post; continuing with the next one", e)
                    true
                }

            if (shouldPause && !pauseBetweenRequests()) {
                warn("[$code] crawl interrupted; stopping early", null)
                return
            }
        }
    }

    /**
     * Crawls a single list row.
     *
     * @return `true` when the detail page was fetched and the article saved,
     *   `false` when the post was already stored and was skipped.
     */
    private fun crawlRow(row: Element): Boolean {
        val rawLink = getPostLink(row)
        val detailUrl = if (rawLink.startsWith("http")) rawLink else "$baseUrl$rawLink"

        if (crawlerRepository.existsByOriginLink(detailUrl)) {
            return false
        }

        val detailDoc = fetch(detailUrl)
        val title = getPostTitle(row)
        articleRepository.save(parseDetailAndGetArticle(targetBoardId, row, detailDoc, detailUrl, title))
        return true
    }

    /**
     * Sleeps between detail requests.
     *
     * @return `false` if the thread was interrupted (the interrupt flag is restored), else `true`.
     */
    private fun pauseBetweenRequests(): Boolean =
        try {
            Thread.sleep(REQUEST_DELAY_MILLIS)
            true
        } catch (e: InterruptedException) {
            Thread.currentThread().interrupt()
            false
        }

    /** Downloads and parses [url] with jsoup. */
    private fun fetch(url: String): Document =
        Jsoup
            .connect(url)
            .userAgent("Mozilla/5.0 ...")
            .timeout(FETCH_TIMEOUT_MILLIS)
            .get()

    /**
     * Records "now" as the last crawl time and "now + [crawlIntervalSeconds]" as the next one
     * for [targetBoardId]. A failure is logged and otherwise ignored.
     */
    protected fun updateExecutionTime() {
        try {
            val now = Instant.now()
            val next = now.plusSeconds(crawlIntervalSeconds)
            crawlerRepository.updateLastCrawledAt(targetBoardId, now, next)
        } catch (e: Exception) {
            warn("[$code] failed to record crawl execution time", e)
        }
    }

    /** Logs a warning with an optional cause. */
    private fun warn(
        message: String,
        cause: Throwable?,
    ) {
        logger.log(java.util.logging.Level.WARNING, message, cause)
    }

    private companion object {
        /** Pause after each processed post, in milliseconds. */
        const val REQUEST_DELAY_MILLIS = 500L

        /** jsoup connect/read timeout, in milliseconds. */
        const val FETCH_TIMEOUT_MILLIS = 10000

        // java.util.logging is referenced by its qualified name so this file needs no new import.
        // NOTE(review): switch to the project's logging facade if one is in use.
        val logger: java.util.logging.Logger =
            java.util.logging.Logger.getLogger(BaseCrawler::class.java.name)
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package com.wafflestudio.team2server.crawler.controller

import com.wafflestudio.team2server.crawler.BaseCrawler
import org.springframework.http.ResponseEntity
import org.springframework.web.bind.annotation.PathVariable
import org.springframework.web.bind.annotation.PostMapping
import org.springframework.web.bind.annotation.RequestMapping
import org.springframework.web.bind.annotation.RestController

/**
 * REST endpoints for triggering crawlers by hand.
 *
 * All [BaseCrawler] beans are injected; a request selects one of them by its `code`.
 */
@RestController
@RequestMapping("/api/crawlers")
class CrawlerController(
    private val crawlers: List<BaseCrawler>,
) {
    /**
     * Runs the crawler whose code equals [crawlerCode], synchronously on the request thread.
     *
     * @return 200 with a completion message; 404 when no crawler has that code;
     *   500 carrying the exception message when the run throws.
     */
    @PostMapping("/{crawlerCode}/run")
    fun manualRun(
        @PathVariable crawlerCode: String,
    ): ResponseEntity<String> {
        val matched = crawlers.firstOrNull { candidate -> candidate.code == crawlerCode }
        if (matched == null) {
            return ResponseEntity.notFound().build()
        }

        return try {
            matched.crawl()
            ResponseEntity.ok("크롤러($crawlerCode) 실행 완료!")
        } catch (failure: Exception) {
            ResponseEntity.internalServerError().body(" 실행 실패: ${failure.message}")
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package com.wafflestudio.team2server.crawler.model

import org.springframework.data.annotation.Id
import org.springframework.data.relational.core.mapping.Table
import java.time.Instant

/**
 * A crawler registration row, mapped to the `crawlers` table.
 *
 * NOTE(review): the V2 migration adds a `code` column to `crawlers`, but no matching
 * property is declared here — confirm whether that is intentional.
 */
@Table("crawlers")
data class Crawler(
@Id
val id: Long? = null,
// Identifier of the board this crawler feeds.
val boardId: Long,
// Time of the next planned crawl.
val nextUpdateAt: Instant,
val createdAt: Instant? = null,
val updatedAt: Instant? = null,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package com.wafflestudio.team2server.crawler.repository

import com.wafflestudio.team2server.article.model.Article
import org.springframework.data.jdbc.repository.query.Modifying
import org.springframework.data.jdbc.repository.query.Query
import org.springframework.data.repository.ListCrudRepository
import org.springframework.data.repository.query.Param
import java.time.Instant

/**
 * Repository used by crawlers for duplicate detection and for recording crawl times.
 *
 * NOTE(review): the entity type parameter is [Article], not `Crawler`. The derived query
 * [existsByOriginLink] relies on that (it resolves against `Article.originLink`), while
 * [updateLastCrawledAt] writes to the `crawlers` table through an explicit query.
 * Consider splitting these two concerns — confirm with the author.
 */
interface CrawlerRepository : ListCrudRepository<Article, Long> {
// Derived query: true when an article with this origin link is already stored.
fun existsByOriginLink(originLink: String): Boolean

// Sets updated_at and next_update_at on the crawlers row(s) whose board_id equals boardId.
@Modifying
@Query("UPDATE crawlers SET updated_at = :now, next_update_at = :next WHERE board_id = :boardId")
fun updateLastCrawledAt(
@Param("boardId") boardId: Long,
@Param("now") now: Instant,
@Param("next") nextUpdateAt: Instant,
)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package com.wafflestudio.team2server.crawler.service

import com.wafflestudio.team2server.article.model.Article
import com.wafflestudio.team2server.article.repository.ArticleRepository
import com.wafflestudio.team2server.crawler.BaseCrawler
import com.wafflestudio.team2server.crawler.repository.CrawlerRepository
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.springframework.scheduling.annotation.Scheduled
import org.springframework.stereotype.Service
import java.time.Instant
import java.time.LocalDateTime
import java.time.ZoneId
import java.time.format.DateTimeFormatter

/**
 * Crawler for the MySNU bulletin board (`https://my.snu.ac.kr`).
 *
 * Articles are stored under board id 2. The crawl runs on a fixed-rate schedule and can
 * also be triggered through the inherited [crawl] method.
 */
@Service
class MysnuCrawlerService(
    // Plain constructor parameters: the base class already keeps both repositories.
    crawlerRepository: CrawlerRepository,
    articleRepository: ArticleRepository,
) : BaseCrawler(crawlerRepository, articleRepository) {
    override val listUrl = "https://my.snu.ac.kr/ctt/bb/bulletin?b=1&ls=20&ln=1&dm=m&inB=&inPx="
    override val baseUrl = "https://my.snu.ac.kr"
    override val targetBoardId = 2L
    override val code = "mysnu"

    // Shares its value with the @Scheduled period below so the two cannot drift apart.
    override val crawlIntervalSeconds = CRAWL_INTERVAL_SECONDS

    /** Format of the "Created Date" value on the detail page. */
    val formatter: DateTimeFormatter = DateTimeFormatter.ofPattern("yyyy.MM.dd HH:mm:ss")

    /** Returns every `ul` element marked `data-name='post_list'`; each one is treated as a post. */
    override fun getPostList(document: Document): List<Element> = document.select("ul[data-name='post_list']")

    /** Reads the title text from the post's title cell. */
    override fun getPostTitle(element: Element): String = element.select("li.bc-s-title .postT span").text().trim()

    /** Reads the post's `data-url` attribute and makes it absolute against [baseUrl] when needed. */
    override fun getPostLink(element: Element): String {
        val rawUrl = element.attr("data-url")
        return if (rawUrl.startsWith("http")) rawUrl else "$baseUrl$rawUrl"
    }

    /**
     * Builds the [Article] for one post.
     *
     * The author defaults to the value on the list row and is replaced by the detail page's
     * "Create User" entry when present. The publication time comes from the "Created Date"
     * entry, read as Asia/Seoul local time; when it is missing or malformed the current time
     * is used and a warning is logged.
     */
    override fun parseDetailAndGetArticle(
        boardId: Long,
        element: Element,
        detailDoc: Document,
        url: String,
        title: String,
    ): Article {
        val content = detailDoc.select("div.text_area").html()

        var author = element.select("li.bc-s-cre_user_name").text().trim()
        var dateStr = ""

        for (dl in detailDoc.select(".sub_search_box.type03 dl")) {
            val label = dl.select("dt").text()
            val value = dl.select("dd").text()
            if (label.contains("Create User")) author = value
            if (label.contains("Created Date")) dateStr = value
        }

        // One timestamp for every "now" field so they are mutually consistent.
        val now = Instant.now()
        val publishedAt = parsePublishedAt(dateStr, url) ?: now

        return Article(
            boardId = boardId,
            title = title,
            content = content,
            author = author,
            originLink = url,
            publishedAt = publishedAt,
            createdAt = now,
            updatedAt = now,
        )
    }

    /** Parses [dateStr] with [formatter] as Asia/Seoul time; returns `null` (and logs) when it does not parse. */
    private fun parsePublishedAt(
        dateStr: String,
        url: String,
    ): Instant? =
        try {
            LocalDateTime.parse(dateStr.trim(), formatter).atZone(SEOUL_ZONE).toInstant()
        } catch (e: java.time.format.DateTimeParseException) {
            logger.log(
                java.util.logging.Level.WARNING,
                "[mysnu] could not parse created date '$dateStr' for $url; falling back to current time",
                e,
            )
            null
        }

    /** Scheduled entry point: crawls, then records the execution time. */
    @Scheduled(fixedRate = CRAWL_INTERVAL_SECONDS * 1000)
    fun runScheduled() {
        crawl()
        updateExecutionTime()
    }

    private companion object {
        /** Crawl period in seconds (one hour). */
        const val CRAWL_INTERVAL_SECONDS = 3600L

        val SEOUL_ZONE: ZoneId = ZoneId.of("Asia/Seoul")

        // Qualified name avoids adding an import to this file.
        val logger: java.util.logging.Logger =
            java.util.logging.Logger.getLogger(MysnuCrawlerService::class.java.name)
    }
}
17 changes: 17 additions & 0 deletions src/main/resources/db/migration/V2__add_boards_and_crawlers.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@

-- V2: extend articles/crawlers and seed the initial boards and the MySNU crawler.

-- Article bodies are stored as LONGTEXT.
ALTER TABLE articles MODIFY COLUMN content LONGTEXT NOT NULL;

-- Articles get a title column, placed after board_id.
ALTER TABLE articles ADD COLUMN title VARCHAR(500) NOT NULL AFTER board_id;

-- Each crawler gets a unique code, placed after id.
-- NOTE(review): NOT NULL + UNIQUE without a default assumes crawlers holds at most one row
-- when this runs — confirm.
ALTER TABLE crawlers ADD COLUMN code VARCHAR(50) NOT NULL UNIQUE AFTER id;

-- Seed board 1 (no source URL).
INSERT INTO boards (id, name, source_url)
VALUES (1, 'Service Notice', NULL);


-- Seed board 2, the target of the MySNU crawler.
INSERT INTO boards (id, name, source_url)
VALUES (2, 'MySNU Notice', 'https://my.snu.ac.kr/ctt/bb/bulletin?b=1');


-- Register the MySNU crawler for board 2.
-- NOTE(review): next_update_at is not supplied here — confirm the column is nullable or has a default.
INSERT INTO crawlers (id, code, board_id)
VALUES (1, 'mysnu', 2);
35 changes: 35 additions & 0 deletions src/test/kotlin/com/wafflestudio/team2server/CrawlerTests.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package com.wafflestudio.team2server

import com.wafflestudio.team2server.crawler.controller.CrawlerController
import com.wafflestudio.team2server.crawler.service.MysnuCrawlerService
import org.junit.jupiter.api.Test
import org.mockito.BDDMockito.given
import org.mockito.Mockito.verify
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.boot.webmvc.test.autoconfigure.WebMvcTest
import org.springframework.test.context.bean.override.mockito.MockitoBean
import org.springframework.test.web.servlet.MockMvc
import org.springframework.test.web.servlet.request.MockMvcRequestBuilders.post
import org.springframework.test.web.servlet.result.MockMvcResultMatchers.status

/**
 * Web-layer test for [CrawlerController].
 *
 * [MysnuCrawlerService] is replaced by a Mockito mock, so no real crawling takes place.
 */
@WebMvcTest(CrawlerController::class)
class CrawlerTests
@Autowired
constructor(
private val mvc: MockMvc,
) {
@MockitoBean
private lateinit var mysnuCrawlerService: MysnuCrawlerService

// Manual crawler run succeeds: POST /api/crawlers/mysnu/run returns 200 and calls crawl() on the mock.
@Test
fun `크롤러 수동 실행 성공`() {
// The controller selects a crawler by its code, so the mock must report "mysnu".
given(mysnuCrawlerService.code).willReturn("mysnu")

mvc
.perform(
post("/api/crawlers/mysnu/run"),
).andExpect(status().isOk)

verify(mysnuCrawlerService).crawl()
}
}