// atrade/src/main/kotlin/service/DynamicNewsScraper.kt

package service

import com.microsoft.playwright.Playwright
import com.microsoft.playwright.BrowserType
import com.microsoft.playwright.Page
import com.microsoft.playwright.options.LoadState
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.coroutineScope
import kotlinx.coroutines.delay
import kotlinx.coroutines.sync.Semaphore
import kotlinx.coroutines.sync.withPermit
import model.NewsItem
import network.CorpInfo
import java.net.URL
import kotlin.random.Random

/**
 * Headless-Chromium scraper that opens a news article URL with Playwright and
 * extracts the text block that looks most like the article body.
 *
 * The Playwright runtime and the browser are created lazily on first use and are
 * then shared by every call.
 *
 * NOTE(review): Playwright for Java documents its objects as not thread-safe.
 * Callers in this file run up to two scrapes concurrently; confirm that they do
 * so on a single thread (or give each thread its own Playwright instance).
 */
object DynamicNewsScraper {
    /** Upper bound (exclusive, in ms) of the random pause taken before each scrape. */
    private const val MAX_START_DELAY_MS = 2000

    /** Navigation timeout in milliseconds, so a hanging site cannot block forever. */
    private const val NAVIGATION_TIMEOUT_MS = 8000.0

    // Glob that matches every request; used for both route() and unroute().
    private const val ALL_REQUESTS = "**/*"

    // Compiled once instead of on every cleanText() call.
    /** Matches any whole line containing the reporter marker ("기자"). */
    private val REPORTER_LINE = Regex("(?m)^.*기자.*$")

    /** Matches any whole line containing the "no unauthorized reproduction" notice. */
    private val COPYRIGHT_LINE = Regex("(?m)^.*무단 전재.*$")

    private val playwright by lazy { Playwright.create() }
    private val browser by lazy {
        playwright.chromium().launch(BrowserType.LaunchOptions().setHeadless(true))
    }

    /**
     * Runs a heuristic body-text extractor inside [page] and returns its result.
     *
     * What the injected script does (it mutates the page's DOM):
     *  1. Removes script/style/noscript/iframe/svg/header/footer/nav elements.
     *  2. For each div/section/article/p/main/td element, builds a "refined" text:
     *     lines are trimmed, empty lines dropped, and any run of two or more
     *     consecutive short lines (10 characters or fewer) is discarded, while a
     *     single isolated short line is kept.
     *  3. Keeps only candidates whose refined text has at least 100 characters and
     *     whose anchor text is less than 30% of that length.
     *  4. Picks the first candidate that has no descendant candidate holding more
     *     than 80% of its refined text; if none qualifies, falls back to the
     *     longest candidate, or to an empty string when there are no candidates.
     *
     * @param page an already-navigated Playwright page.
     * @return the extracted text, or "" when nothing suitable was found.
     */
    fun extractSmartContentWithLineFilter(page: Page): String {
        // The script literal is runtime text sent to the browser; it is kept verbatim.
        val script = """
        () => {
    // 1. 선제적 노이즈 제거: 분석에 방해되는 태그들을 DOM에서 아예 삭제
    const junkTags = ['SCRIPT', 'STYLE', 'NOSCRIPT', 'IFRAME', 'SVG', 'HEADER', 'FOOTER', 'NAV'];
    document.querySelectorAll(junkTags.join(',')).forEach(el => el.remove());

    const MIN_LINE_LENGTH = 10;
    const MIN_TOTAL_LENGTH = 100;
    const CONSECUTIVE_THRESHOLD = 2;

    // 2. 라인별 정제 함수 (짧은 라인 연속 시 예외 처리)
    const getRefinedText = (el) => {
        // 실제 텍스트만 추출하여 라인별로 분리
        const lines = el.innerText.split('\n').map(l => l.trim()).filter(l => l.length > 0);
        let resultLines = [];
        let tempBuffer = [];
        let consecutiveShort = 0;

        lines.forEach(line => {
            if (line.length <= MIN_LINE_LENGTH) {
                consecutiveShort++;
                tempBuffer.push(line);
            } else {
                // 짧은 줄이 연속되지 않았을 때만 버퍼를 결과에 합침
                if (consecutiveShort < CONSECUTIVE_THRESHOLD) {
                    resultLines = resultLines.concat(tempBuffer);
                }
                resultLines.push(line);
                tempBuffer = [];
                consecutiveShort = 0;
            }
        });

        // 마지막 남은 버퍼 처리 (본문 끝에 짧은 정보가 있을 경우 대비)
        if (consecutiveShort < CONSECUTIVE_THRESHOLD) {
            resultLines = resultLines.concat(tempBuffer);
        }
        return resultLines.join('\n');
    };

    // 3. 후보 블록 탐색 및 텍스트 밀도 기반 분석
    const candidates = Array.from(document.querySelectorAll('div, section, article, p, main, td'))
        .map(el => ({
            el: el,
            refinedText: getRefinedText(el)
        }))
        .filter(item => {
            if (item.refinedText.length < MIN_TOTAL_LENGTH) return false;
            
            // 링크 밀도 체크: 기사 본문은 보통 링크보다 텍스트 비중이 훨씬 높음
            const linkLength = Array.from(item.el.querySelectorAll('a'))
                                    .reduce((acc, a) => acc + (a.innerText || "").length, 0);
            return (linkLength / item.refinedText.length) < 0.3;
        });

    // 4. 가장 최적의(가장 깊은 계층의) 본문 컨테이너 선정
    const best = candidates.find(parent => 
        !candidates.some(child => 
            parent.el !== child.el && 
            parent.el.contains(child.el) && 
            child.refinedText.length > parent.refinedText.length * 0.8
        )
    );

    return best ? best.refinedText : (candidates.sort((a,b) => b.refinedText.length - a.refinedText.length)[0]?.refinedText || "");
}
    """.trimIndent()

        // The script always returns a string, but evaluate() is typed as Any?;
        // a safe cast avoids a crash if the page ever hands back something else.
        return page.evaluate(script) as? String ?: ""
    }

    /** Hosts for which a scrape has already failed; later calls for them are skipped. */
    var failDomainList = arrayListOf<String>()

    /**
     * Loads [url] in a fresh browser context and returns the cleaned article text.
     *
     * A random pause of up to [MAX_START_DELAY_MS] ms is taken first. Any exception
     * raised while the page is open is logged, the URL's host is added to
     * [failDomainList], and "" is returned. Hosts already in that list are skipped
     * without touching the browser.
     *
     * @param url absolute URL of the article.
     * @return the extracted and cleaned text, or "" when the host is blacklisted or
     *   scraping failed.
     * @throws java.net.MalformedURLException if [url] cannot be parsed.
     */
    suspend fun fetchFullContent(url: String): String {
        val domain = URL(url).host

        // Check the blacklist BEFORE acquiring a browser context: the early return
        // would otherwise leave a context open that nothing ever closes.
        if (failDomainList.contains(domain)) {
            println("실패한 도메인 스크래핑 종료 $domain ")
            return ""
        }

        // Pause before any browser resources are held, and outside the try below,
        // so that coroutine cancellation propagates instead of being recorded as a
        // scraping failure of this domain.
        delay(Random.nextInt(MAX_START_DELAY_MS).toLong())

        val context = browser.newContext()
        return try {
            context.use { ctx ->
                ctx.newPage().use { page ->
                    // 1. Abort image/font/stylesheet requests.
                    blockUnnecessaryResources(page)

                    // 2. Navigate with a timeout to avoid waiting forever.
                    page.navigate(url, Page.NavigateOptions().setTimeout(NAVIGATION_TIMEOUT_MS))

                    // 3. Wait for the load event before reading the DOM.
                    page.waitForLoadState(LoadState.LOAD)

                    val content = cleanText(extractSmartContentWithLineFilter(page))

                    // 4. Unregister the route handler explicitly before the page closes.
                    page.unroute(ALL_REQUESTS)

                    content
                }
            }
        } catch (e: Exception) {
            failDomainList.add(domain)
            println("❌ [Playwright] 스크래핑 실패 ($url): ${e.message}")
            ""
        }
    }

    /**
     * Installs a route handler on [page] that aborts image, font and stylesheet
     * requests and lets every other request continue.
     */
    private fun blockUnnecessaryResources(page: Page) {
        page.route(ALL_REQUESTS) { route ->
            try {
                // A null request or resource type falls through to resume().
                when (route.request()?.resourceType()) {
                    "image", "font", "stylesheet" -> route.abort()
                    else -> route.resume()
                }
            } catch (e: Exception) {
                // Deliberately best-effort: a failure to abort/resume one request
                // must not break the whole scrape, so it is ignored here.
            }
        }
    }

    /**
     * Blanks out every line that contains the reporter marker or the copyright
     * notice, then trims leading and trailing whitespace.
     */
    private fun cleanText(text: String): String =
        text.replace(REPORTER_LINE, "")
            .replace(COPYRIGHT_LINE, "")
            .trim()
}

/**
 * Scrapes and ingests a batch of news items concurrently, with a cap on how many
 * scrapes may run at the same time.
 */
object SafeScraper {
    // At most 2 scrape-and-ingest operations hold a permit at once.
    private val semaphore = Semaphore(2)

    /**
     * Starts one coroutine per item in [urls]. Items whose link
     * `UrlCacheManager.isAlreadyProcessed` does not report as `false` are skipped;
     * for the rest, the article is fetched with
     * [DynamicNewsScraper.fetchFullContent] and passed to
     * `RagService.ingestWithChunking` while holding a semaphore permit.
     *
     * A failure of a single item is logged and does not affect the others.
     * Coroutine cancellation is rethrown so that cancelling the caller actually
     * stops the batch. The function returns after all items have finished.
     *
     * @param corpInfo company the news items belong to; its fields are forwarded
     *   to the ingest call and used in log messages.
     * @param urls news items to process.
     */
    suspend fun scrapeParallel(corpInfo: CorpInfo,urls: List<NewsItem>) = coroutineScope {
        val query = "${corpInfo.cName} ${corpInfo.cCode} ${corpInfo.stockCode}"
        urls.map { item ->
            async {
                if (UrlCacheManager.isAlreadyProcessed(item.originallink) == false) {
                    try {
                        semaphore.withPermit {
                            RagService.ingestWithChunking(
                                text = DynamicNewsScraper.fetchFullContent(item.originallink),
                                newsLink = item.originallink,
                                pubDate = item.pubDate,
                                stockCode = corpInfo.stockCode,
                                corpName = corpInfo.cName,
                                corpCode = corpInfo.cCode,
                                stcokName = corpInfo.stockName
                            )
                        }
                    } catch (e: kotlinx.coroutines.CancellationException) {
                        // Never swallow cancellation: it must reach the enclosing scope.
                        throw e
                    } catch (e: Exception) {
                        println("${e.message}")
                    }
                    println("📰 '${query}' 관련 뉴스 새로운 학습 데이터 게더링")
                } else {
                    println("📰 '${query}' 관련 뉴스 기 학습 데이터 스킵")
                }
            }
        }.awaitAll()
        println("$query 관련 뉴스 ${urls.size}개 학습 완료")
    }
}
.... 2026-01-23 17:05:09 +09:00			`package service`

			`import com.microsoft.playwright.Playwright`
			`import com.microsoft.playwright.BrowserType`
			`import com.microsoft.playwright.Page`
... 2026-02-04 14:52:09 +09:00			`import com.microsoft.playwright.options.LoadState`
.... 2026-01-23 17:05:09 +09:00			`import kotlinx.coroutines.async`
			`import kotlinx.coroutines.awaitAll`
			`import kotlinx.coroutines.coroutineScope`
			`import kotlinx.coroutines.delay`
			`import kotlinx.coroutines.sync.Semaphore`
			`import kotlinx.coroutines.sync.withPermit`
			`import model.NewsItem`
			`import network.CorpInfo`
... 2026-02-05 14:26:02 +09:00			`import java.net.URL`
.... 2026-01-23 17:05:09 +09:00			`import kotlin.random.Random`

			`object DynamicNewsScraper {`
			`private val playwright by lazy { Playwright.create() }`
			`private val browser by lazy {`
			`playwright.chromium().launch(BrowserType.LaunchOptions().setHeadless(true))`
			`}`

			`fun extractSmartContentWithLineFilter(page: Page): String {`
			`val script = """`
			`() => {`
			`// 1. 선제적 노이즈 제거: 분석에 방해되는 태그들을 DOM에서 아예 삭제`
			`const junkTags = ['SCRIPT', 'STYLE', 'NOSCRIPT', 'IFRAME', 'SVG', 'HEADER', 'FOOTER', 'NAV'];`
			`document.querySelectorAll(junkTags.join(',')).forEach(el => el.remove());`

			`const MIN_LINE_LENGTH = 10;`
			`const MIN_TOTAL_LENGTH = 100;`
			`const CONSECUTIVE_THRESHOLD = 2;`

			`// 2. 라인별 정제 함수 (짧은 라인 연속 시 예외 처리)`
			`const getRefinedText = (el) => {`
			`// 실제 텍스트만 추출하여 라인별로 분리`
			`const lines = el.innerText.split('\n').map(l => l.trim()).filter(l => l.length > 0);`
			`let resultLines = [];`
			`let tempBuffer = [];`
			`let consecutiveShort = 0;`

			`lines.forEach(line => {`
			`if (line.length <= MIN_LINE_LENGTH) {`
			`consecutiveShort++;`
			`tempBuffer.push(line);`
			`} else {`
			`// 짧은 줄이 연속되지 않았을 때만 버퍼를 결과에 합침`
			`if (consecutiveShort < CONSECUTIVE_THRESHOLD) {`
			`resultLines = resultLines.concat(tempBuffer);`
			`}`
			`resultLines.push(line);`
			`tempBuffer = [];`
			`consecutiveShort = 0;`
			`}`
			`});`

			`// 마지막 남은 버퍼 처리 (본문 끝에 짧은 정보가 있을 경우 대비)`
			`if (consecutiveShort < CONSECUTIVE_THRESHOLD) {`
			`resultLines = resultLines.concat(tempBuffer);`
			`}`
			`return resultLines.join('\n');`
			`};`

			`// 3. 후보 블록 탐색 및 텍스트 밀도 기반 분석`
			`const candidates = Array.from(document.querySelectorAll('div, section, article, p, main, td'))`
			`.map(el => ({`
			`el: el,`
			`refinedText: getRefinedText(el)`
			`}))`
			`.filter(item => {`
			`if (item.refinedText.length < MIN_TOTAL_LENGTH) return false;`

			`// 링크 밀도 체크: 기사 본문은 보통 링크보다 텍스트 비중이 훨씬 높음`
			`const linkLength = Array.from(item.el.querySelectorAll('a'))`
			`.reduce((acc, a) => acc + (a.innerText \|\| "").length, 0);`
			`return (linkLength / item.refinedText.length) < 0.3;`
			`});`

			`// 4. 가장 최적의(가장 깊은 계층의) 본문 컨테이너 선정`
			`const best = candidates.find(parent =>`
			`!candidates.some(child =>`
			`parent.el !== child.el &&`
			`parent.el.contains(child.el) &&`
			`child.refinedText.length > parent.refinedText.length * 0.8`
			`)`
			`);`

			`return best ? best.refinedText : (candidates.sort((a,b) => b.refinedText.length - a.refinedText.length)[0]?.refinedText \|\| "");`
			`}`
			`""".trimIndent()`

			`return page.evaluate(script) as String`
			`}`
... 2026-02-05 14:26:02 +09:00			`var failDomainList = arrayListOf<String>()`
.... 2026-01-23 17:05:09 +09:00			`suspend fun fetchFullContent(url: String): String {`
... 2026-02-04 14:52:09 +09:00			`// browser.newContext().use { ... } 대신 직접 변수를 선언하고 제어합니다.`
... 2026-02-05 14:26:02 +09:00			`val domain = URL(url).host`
.... 2026-01-23 17:05:09 +09:00			`val context = browser.newContext()`
... 2026-02-05 14:26:02 +09:00			`if(failDomainList.contains(domain)) {`
			`println("실패한 도메인 스크래핑 종료 $domain ")`
			`return ""`
			`}`
.... 2026-01-23 17:05:09 +09:00			`return try {`
... 2026-02-04 14:52:09 +09:00			`context.use { ctx ->`
			`ctx.newPage().use { page ->`
... 2026-02-05 14:26:02 +09:00			`delay(Random.nextInt(2000).toLong())`
.... 2026-01-23 17:05:09 +09:00
... 2026-02-04 14:52:09 +09:00			`// 1. 리스너 설정 시 예외 처리 강화`
			`blockUnnecessaryResources(page)`
.... 2026-01-23 17:05:09 +09:00
... 2026-02-04 14:52:09 +09:00			`// 2. 타임아웃을 설정하여 무한 대기 방지`
... 2026-02-05 14:26:02 +09:00			`val options = Page.NavigateOptions().setTimeout(8000.0)`
... 2026-02-04 14:52:09 +09:00			`page.navigate(url, options)`

			`// 3. 페이지가 완전히 닫히기 전에 모든 대기 중인 이벤트를 해제하기 위해 LOAD 상태 대기`
			`page.waitForLoadState(LoadState.LOAD)`

			`val content = cleanText(extractSmartContentWithLineFilter(page))`

			`// 4. 명시적으로 route를 해제하여 close 시 발생할 수 있는 리스너 충돌 방지`
			`page.unroute("**/*")`

			`content`
			`}`
			`}`
.... 2026-01-23 17:05:09 +09:00			`} catch (e: Exception) {`
... 2026-02-05 14:26:02 +09:00			`failDomainList.add(domain)`
... 2026-02-04 14:52:09 +09:00			`println("❌ [Playwright] 스크래핑 실패 ($url): ${e.message}")`
.... 2026-01-23 17:05:09 +09:00			`""`
			`} finally {`
... 2026-02-04 14:52:09 +09:00			`// use 블록이 자원을 닫으려 할 때 발생하는 오류는 내부적으로 처리되거나 무시되도록 유도`
.... 2026-01-23 17:05:09 +09:00			`}`
			`}`

			`private fun blockUnnecessaryResources(page: Page) {`
			`// 이미지, 폰트, CSS 등 불필요한 요청 가로채서 중단`
... 2026-02-04 14:52:09 +09:00			`page.route("**/*") { route ->`
			`try {`
			`val req = route.request()`
			`if (req != null) {`
			`val type = req.resourceType()`
			`if (type == "image" \|\| type == "font" \|\| type == "stylesheet") {`
			`route.abort()`
			`} else {`
			`route.resume()`
			`}`
			`} else {`
			`// request가 이미 null이면 처리를 포기`
			`route.resume()`
			`}`
			`} catch (e: Exception) {`

			`}`
.... 2026-01-23 17:05:09 +09:00			`}`
			`}`

			`private fun cleanText(text: String): String {`
			`return text.replace(Regex("(?m)^.*기자.*$"), "") // 기자 정보 제거`
			`.replace(Regex("(?m)^.*무단 전재.*$"), "") // 저작권 문구 제거`
			`.trim()`
			`}`
			`}`

			`object SafeScraper {`
			`// 동시 실행 브라우저 탭을 5개로 제한 (M3 Pro라면 10~20개도 여유롭습니다)`
. 2026-01-26 15:32:03 +09:00			`private val semaphore = Semaphore(2)`
.... 2026-01-23 17:05:09 +09:00
			`suspend fun scrapeParallel(corpInfo: CorpInfo,urls: List<NewsItem>) = coroutineScope {`
			`var query = "${corpInfo.cName} ${corpInfo.cCode} ${corpInfo.stockCode}"`
			`urls.map { item ->`
			`async {`
			`if (UrlCacheManager.isAlreadyProcessed(item.originallink) == false) {`
... 2026-02-04 14:52:09 +09:00			`try {`
			`semaphore.withPermit {`
			`try {`
			`RagService.ingestWithChunking(`
			`text = DynamicNewsScraper.fetchFullContent(item.originallink),`
			`newsLink = item.originallink,`
			`pubDate = item.pubDate,`
			`stockCode = corpInfo.stockCode,`
			`corpName = corpInfo.cName,`
			`corpCode = corpInfo.cCode,`
			`stcokName = corpInfo.stockName`
			`)`
			`}catch (e: Exception) {`
			`println("${e.message}")`
			`}`

			`}`
			`}catch (e: Exception) {`
			`println("${e.message}")`
.... 2026-01-23 17:05:09 +09:00			`}`
			`println("📰 '${query}' 관련 뉴스 새로운 학습 데이터 게더링")`
			`} else {`
			`println("📰 '${query}' 관련 뉴스 기 학습 데이터 스킵")`
			`}`
			`}`
			`}.awaitAll()`
			`println("$query 관련 뉴스 ${urls.size}개 학습 완료")`
			`}`
			`}`