atrade/src/main/kotlin/service/DynamicNewsScraper.kt

package service

import com.microsoft.playwright.Playwright
import com.microsoft.playwright.BrowserType
import com.microsoft.playwright.Page
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.coroutineScope
import kotlinx.coroutines.delay
import kotlinx.coroutines.sync.Semaphore
import kotlinx.coroutines.sync.withPermit
import model.NewsItem
import network.CorpInfo
import kotlin.random.Random

/**
 * Headless-Chromium scraper that loads a news URL with Playwright and extracts the
 * most article-like text block from the rendered page.
 *
 * A single [Playwright] instance and a single browser are created lazily on first use
 * and shared by every call; each fetch gets its own browser context.
 *
 * NOTE(review): Playwright for Java documents its objects as not thread-safe. This
 * object does no thread confinement or locking itself, so callers that invoke
 * [fetchFullContent] concurrently from different threads should be checked — confirm.
 */
object DynamicNewsScraper {
    private val playwright by lazy { Playwright.create() }
    private val browser by lazy {
        playwright.chromium().launch(BrowserType.LaunchOptions().setHeadless(true))
    }

    // Compiled once instead of on every cleanText call.
    // Matches any whole line containing "기자" (reporter byline).
    private val REPORTER_LINE = Regex("(?m)^.*기자.*$")
    // Matches any whole line containing "무단 전재" (copyright / no-redistribution notice).
    private val COPYRIGHT_LINE = Regex("(?m)^.*무단 전재.*$")

    /**
     * Runs an in-page script that picks the main text block of [page] and returns it.
     *
     * The script, in order:
     * 1. Removes script/style/noscript/iframe/svg/header/footer/nav elements from the
     *    live DOM (the page is mutated).
     * 2. For each `div, section, article, p, main, td`, builds a "refined" text: lines of
     *    10 characters or fewer are buffered and only kept when fewer than 2 of them
     *    occur in a row.
     * 3. Keeps candidates whose refined text is at least 100 characters and whose anchor
     *    text length is under 30% of the refined text length.
     * 4. Returns the first candidate that has no descendant candidate holding more than
     *    80% of its refined length; otherwise the longest candidate, or an empty string.
     *
     * @param page an already-navigated Playwright page.
     * @return the selected text, or an empty string when nothing qualifies or the script
     *   does not yield a string.
     */
    fun extractSmartContentWithLineFilter(page: Page): String {
        val script = """
        () => {
    // 1. 선제적 노이즈 제거: 분석에 방해되는 태그들을 DOM에서 아예 삭제
    const junkTags = ['SCRIPT', 'STYLE', 'NOSCRIPT', 'IFRAME', 'SVG', 'HEADER', 'FOOTER', 'NAV'];
    document.querySelectorAll(junkTags.join(',')).forEach(el => el.remove());

    const MIN_LINE_LENGTH = 10;
    const MIN_TOTAL_LENGTH = 100;
    const CONSECUTIVE_THRESHOLD = 2;

    // 2. 라인별 정제 함수 (짧은 라인 연속 시 예외 처리)
    const getRefinedText = (el) => {
        // 실제 텍스트만 추출하여 라인별로 분리
        const lines = el.innerText.split('\n').map(l => l.trim()).filter(l => l.length > 0);
        let resultLines = [];
        let tempBuffer = [];
        let consecutiveShort = 0;

        lines.forEach(line => {
            if (line.length <= MIN_LINE_LENGTH) {
                consecutiveShort++;
                tempBuffer.push(line);
            } else {
                // 짧은 줄이 연속되지 않았을 때만 버퍼를 결과에 합침
                if (consecutiveShort < CONSECUTIVE_THRESHOLD) {
                    resultLines = resultLines.concat(tempBuffer);
                }
                resultLines.push(line);
                tempBuffer = [];
                consecutiveShort = 0;
            }
        });

        // 마지막 남은 버퍼 처리 (본문 끝에 짧은 정보가 있을 경우 대비)
        if (consecutiveShort < CONSECUTIVE_THRESHOLD) {
            resultLines = resultLines.concat(tempBuffer);
        }
        return resultLines.join('\n');
    };

    // 3. 후보 블록 탐색 및 텍스트 밀도 기반 분석
    const candidates = Array.from(document.querySelectorAll('div, section, article, p, main, td'))
        .map(el => ({
            el: el,
            refinedText: getRefinedText(el)
        }))
        .filter(item => {
            if (item.refinedText.length < MIN_TOTAL_LENGTH) return false;
            
            // 링크 밀도 체크: 기사 본문은 보통 링크보다 텍스트 비중이 훨씬 높음
            const linkLength = Array.from(item.el.querySelectorAll('a'))
                                    .reduce((acc, a) => acc + (a.innerText || "").length, 0);
            return (linkLength / item.refinedText.length) < 0.3;
        });

    // 4. 가장 최적의(가장 깊은 계층의) 본문 컨테이너 선정
    const best = candidates.find(parent => 
        !candidates.some(child => 
            parent.el !== child.el && 
            parent.el.contains(child.el) && 
            child.refinedText.length > parent.refinedText.length * 0.8
        )
    );

    return best ? best.refinedText : (candidates.sort((a,b) => b.refinedText.length - a.refinedText.length)[0]?.refinedText || "");
}
    """.trimIndent()

        // Safe cast: a non-string result becomes "" instead of a ClassCastException.
        return page.evaluate(script) as? String ?: ""
    }

    /**
     * Loads [url] in a fresh browser context and returns the cleaned article text.
     *
     * Sleeps for a random 0–999 ms before touching the browser, blocks static-asset
     * requests, navigates, waits for the load state, then extracts and cleans the text.
     * The extracted text is also printed to stdout.
     *
     * @param url address of the article to scrape.
     * @return the cleaned text, or an empty string if anything inside the scrape throws
     *   an [Exception] (the failure message is printed, not rethrown).
     */
    suspend fun fetchFullContent(url: String): String {
        // Jitter happens before any browser resource exists: `delay` is the only
        // suspension point here, so cancellation during it can no longer strand an
        // open context/page.
        delay(Random.nextInt(1000).toLong())

        val context = browser.newContext()
        return try {
            // Created inside the try so a failure here still closes the context.
            val page = context.newPage()
            blockUnnecessaryResources(page)
            page.navigate(url)
            // No argument is passed, so this waits for Playwright's default load state
            // rather than network idle.
            page.waitForLoadState()

            val content = cleanText(extractSmartContentWithLineFilter(page))
            println("finded : $content")
            content
        } catch (e: Exception) {
            println("❌ [Playwright] 스크래핑 실패: ${e.message}")
            ""
        } finally {
            // Closing the context also closes every page that belongs to it.
            context.close()
        }
    }

    /**
     * Registers a route on [page] that aborts requests whose URL matches the
     * image / SVG / CSS / web-font extension glob.
     */
    private fun blockUnnecessaryResources(page: Page) {
        // Intercept and abort requests for images, fonts, CSS and similar assets.
        page.route("**/*.{png,jpg,jpeg,gif,webp,svg,css,woff,woff2}") { route ->
            route.abort()
        }
    }

    /**
     * Blanks out every line containing a reporter byline marker or a copyright notice
     * marker, then trims leading and trailing whitespace from the whole text.
     */
    private fun cleanText(text: String): String {
        return text.replace(REPORTER_LINE, "") // drop reporter byline lines
            .replace(COPYRIGHT_LINE, "") // drop copyright notice lines
            .trim()
    }
}

/**
 * Fans article scraping and ingestion out over coroutines while bounding how many
 * scrape-and-ingest steps run at the same time.
 */
object SafeScraper {
    // At most 2 scrape-and-ingest steps may hold a permit at any moment.
    private val semaphore = Semaphore(2)

    /**
     * Scrapes and ingests every item of [urls] for the company described by [corpInfo].
     *
     * One coroutine is started per item. Items whose original link is already reported
     * as processed by `UrlCacheManager` are skipped with a log line; the rest are
     * scraped and passed to `RagService.ingestWithChunking` under the semaphore.
     * Suspends until all coroutines finish, then prints a summary line using the size
     * of [urls] (skipped items included).
     *
     * @param corpInfo company identifiers forwarded to the ingestion call and used in logs.
     * @param urls news items whose `originallink` is scraped.
     */
    suspend fun scrapeParallel(corpInfo: CorpInfo,urls: List<NewsItem>) = coroutineScope {
        val query = "${corpInfo.cName} ${corpInfo.cCode} ${corpInfo.stockCode}"

        val jobs = urls.map { news ->
            async {
                val link = news.originallink

                // Guard clause: anything not explicitly reported as unprocessed is skipped.
                if (UrlCacheManager.isAlreadyProcessed(link) != false) {
                    println("📰 '${query}' 관련 뉴스 기 학습 데이터 스킵")
                    return@async
                }

                semaphore.withPermit {
                    // Scrape first, then ingest, both while holding the permit.
                    val articleText = DynamicNewsScraper.fetchFullContent(link)
                    RagService.ingestWithChunking(
                        text = articleText,
                        newsLink = link,
                        pubDate = news.pubDate,
                        stockCode = corpInfo.stockCode,
                        corpName = corpInfo.cName,
                        corpCode = corpInfo.cCode,
                        stcokName = corpInfo.stockName
                    )
                }
                println("📰 '${query}' 관련 뉴스 새로운 학습 데이터 게더링")
            }
        }

        jobs.awaitAll()
        println("$query 관련 뉴스 ${urls.size}개 학습 완료")
    }
}
.... 2026-01-23 17:05:09 +09:00			`package service`

			`import com.microsoft.playwright.Playwright`
			`import com.microsoft.playwright.BrowserType`
			`import com.microsoft.playwright.Page`
			`import kotlinx.coroutines.async`
			`import kotlinx.coroutines.awaitAll`
			`import kotlinx.coroutines.coroutineScope`
			`import kotlinx.coroutines.delay`
			`import kotlinx.coroutines.sync.Semaphore`
			`import kotlinx.coroutines.sync.withPermit`
			`import model.NewsItem`
			`import network.CorpInfo`
			`import kotlin.random.Random`

			`object DynamicNewsScraper {`
			`private val playwright by lazy { Playwright.create() }`
			`private val browser by lazy {`
			`playwright.chromium().launch(BrowserType.LaunchOptions().setHeadless(true))`
			`}`

			`fun extractSmartContentWithLineFilter(page: Page): String {`
			`val script = """`
			`() => {`
			`// 1. 선제적 노이즈 제거: 분석에 방해되는 태그들을 DOM에서 아예 삭제`
			`const junkTags = ['SCRIPT', 'STYLE', 'NOSCRIPT', 'IFRAME', 'SVG', 'HEADER', 'FOOTER', 'NAV'];`
			`document.querySelectorAll(junkTags.join(',')).forEach(el => el.remove());`

			`const MIN_LINE_LENGTH = 10;`
			`const MIN_TOTAL_LENGTH = 100;`
			`const CONSECUTIVE_THRESHOLD = 2;`

			`// 2. 라인별 정제 함수 (짧은 라인 연속 시 예외 처리)`
			`const getRefinedText = (el) => {`
			`// 실제 텍스트만 추출하여 라인별로 분리`
			`const lines = el.innerText.split('\n').map(l => l.trim()).filter(l => l.length > 0);`
			`let resultLines = [];`
			`let tempBuffer = [];`
			`let consecutiveShort = 0;`

			`lines.forEach(line => {`
			`if (line.length <= MIN_LINE_LENGTH) {`
			`consecutiveShort++;`
			`tempBuffer.push(line);`
			`} else {`
			`// 짧은 줄이 연속되지 않았을 때만 버퍼를 결과에 합침`
			`if (consecutiveShort < CONSECUTIVE_THRESHOLD) {`
			`resultLines = resultLines.concat(tempBuffer);`
			`}`
			`resultLines.push(line);`
			`tempBuffer = [];`
			`consecutiveShort = 0;`
			`}`
			`});`

			`// 마지막 남은 버퍼 처리 (본문 끝에 짧은 정보가 있을 경우 대비)`
			`if (consecutiveShort < CONSECUTIVE_THRESHOLD) {`
			`resultLines = resultLines.concat(tempBuffer);`
			`}`
			`return resultLines.join('\n');`
			`};`

			`// 3. 후보 블록 탐색 및 텍스트 밀도 기반 분석`
			`const candidates = Array.from(document.querySelectorAll('div, section, article, p, main, td'))`
			`.map(el => ({`
			`el: el,`
			`refinedText: getRefinedText(el)`
			`}))`
			`.filter(item => {`
			`if (item.refinedText.length < MIN_TOTAL_LENGTH) return false;`

			`// 링크 밀도 체크: 기사 본문은 보통 링크보다 텍스트 비중이 훨씬 높음`
			`const linkLength = Array.from(item.el.querySelectorAll('a'))`
			`.reduce((acc, a) => acc + (a.innerText \|\| "").length, 0);`
			`return (linkLength / item.refinedText.length) < 0.3;`
			`});`

			`// 4. 가장 최적의(가장 깊은 계층의) 본문 컨테이너 선정`
			`const best = candidates.find(parent =>`
			`!candidates.some(child =>`
			`parent.el !== child.el &&`
			`parent.el.contains(child.el) &&`
			`child.refinedText.length > parent.refinedText.length * 0.8`
			`)`
			`);`

			`return best ? best.refinedText : (candidates.sort((a,b) => b.refinedText.length - a.refinedText.length)[0]?.refinedText \|\| "");`
			`}`
			`""".trimIndent()`

			`return page.evaluate(script) as String`
			`}`

			`suspend fun fetchFullContent(url: String): String {`
			`val context = browser.newContext()`
			`val page = context.newPage()`
			`delay(Random.nextInt(1000).toLong())`
			`return try {`
			`// 1. 페이지 이동 및 네트워크 유휴 상태까지 대기`
			`blockUnnecessaryResources(page)`
			`page.navigate(url)`
			`// println(url)`
			`page.waitForLoadState()`


			`var finded = cleanText(extractSmartContentWithLineFilter(page))`
			`println("finded : $finded")`
			`finded`
			`} catch (e: Exception) {`
			`println("❌ [Playwright] 스크래핑 실패: ${e.message}")`
			`""`
			`} finally {`
			`page.close()`
			`context.close()`
			`}`
			`}`

			`private fun blockUnnecessaryResources(page: Page) {`
			`// 이미지, 폰트, CSS 등 불필요한 요청 가로채서 중단`
			`page.route("**/*.{png,jpg,jpeg,gif,webp,svg,css,woff,woff2}") { route ->`
			`route.abort()`
			`}`
			`}`

			`private fun cleanText(text: String): String {`
			`return text.replace(Regex("(?m)^.*기자.*$"), "") // 기자 정보 제거`
			`.replace(Regex("(?m)^.*무단 전재.*$"), "") // 저작권 문구 제거`
			`.trim()`
			`}`
			`}`

			`object SafeScraper {`
			`// 동시 실행 브라우저 탭을 5개로 제한 (M3 Pro라면 10~20개도 여유롭습니다)`
. 2026-01-26 15:32:03 +09:00			`private val semaphore = Semaphore(2)`
.... 2026-01-23 17:05:09 +09:00
			`suspend fun scrapeParallel(corpInfo: CorpInfo,urls: List<NewsItem>) = coroutineScope {`
			`var query = "${corpInfo.cName} ${corpInfo.cCode} ${corpInfo.stockCode}"`
			`urls.map { item ->`
			`async {`
			`if (UrlCacheManager.isAlreadyProcessed(item.originallink) == false) {`
			`semaphore.withPermit {`
			`RagService.ingestWithChunking(`
			`text = DynamicNewsScraper.fetchFullContent(item.originallink),`
			`newsLink = item.originallink,`
			`pubDate = item.pubDate,`
			`stockCode = corpInfo.stockCode,`
			`corpName = corpInfo.cName,`
			`corpCode = corpInfo.cCode,`
			`stcokName = corpInfo.stockName`
			`)`
			`}`
			`println("📰 '${query}' 관련 뉴스 새로운 학습 데이터 게더링")`
			`} else {`
			`println("📰 '${query}' 관련 뉴스 기 학습 데이터 스킵")`
			`}`
			`}`
			`}.awaitAll()`
			`println("$query 관련 뉴스 ${urls.size}개 학습 완료")`
			`}`
			`}`