From b191022e6d2166101268ed2fcbc627488f9b833e Mon Sep 17 00:00:00 2001 From: lunaticbum Date: Fri, 7 Mar 2025 18:33:09 +0900 Subject: [PATCH] .... --- .../lunaticbum/back/lun/configs/AppConfig.kt | 1 + .../back/lun/controllers/Telegram.kt | 2 +- .../kr/lunaticbum/back/lun/model/SearXng.kt | 3 +- .../kr/lunaticbum/back/lun/service/Lama.kt | 126 ++++++++++++++---- 4 files changed, 101 insertions(+), 31 deletions(-) diff --git a/src/main/kotlin/kr/lunaticbum/back/lun/configs/AppConfig.kt b/src/main/kotlin/kr/lunaticbum/back/lun/configs/AppConfig.kt index 8357b66..70d88be 100644 --- a/src/main/kotlin/kr/lunaticbum/back/lun/configs/AppConfig.kt +++ b/src/main/kotlin/kr/lunaticbum/back/lun/configs/AppConfig.kt @@ -39,6 +39,7 @@ class AppConfig : WebMvcConfigurer { @Bean fun chatClient(): OllamaApi { return OllamaApi("https://lama.lunaticbum.kr") + // .withDefaultOptions( // OllamaOptions.create() // .withModel("phi4:14b") diff --git a/src/main/kotlin/kr/lunaticbum/back/lun/controllers/Telegram.kt b/src/main/kotlin/kr/lunaticbum/back/lun/controllers/Telegram.kt index 84cc1be..3fd0252 100644 --- a/src/main/kotlin/kr/lunaticbum/back/lun/controllers/Telegram.kt +++ b/src/main/kotlin/kr/lunaticbum/back/lun/controllers/Telegram.kt @@ -410,7 +410,7 @@ class Telegram { // } // } CoroutineScope(Dispatchers.IO).async { - lama.generateResponse(query = originalQuery) + lama.generateResponse(query = originalQuery?.replace("오늘", SimpleDateFormat("yyyMMdd").format(Date()))) } return "TEST" } diff --git a/src/main/kotlin/kr/lunaticbum/back/lun/model/SearXng.kt b/src/main/kotlin/kr/lunaticbum/back/lun/model/SearXng.kt index c83814b..6bb724e 100644 --- a/src/main/kotlin/kr/lunaticbum/back/lun/model/SearXng.kt +++ b/src/main/kotlin/kr/lunaticbum/back/lun/model/SearXng.kt @@ -12,7 +12,7 @@ class SearXng { var unresponsive_engines: ArrayList>? = null } class SearXngResult { - var originQuery : String? = null +// var originQuery : String? = null var url: String? = null var title: String? = null var content: String? = null @@ -25,4 +25,5 @@ class SearXngResult { var score: Double = 0.0 var category: String? = null var pageData : String? = null + var originHtml : String? = null } diff --git a/src/main/kotlin/kr/lunaticbum/back/lun/service/Lama.kt b/src/main/kotlin/kr/lunaticbum/back/lun/service/Lama.kt index d8c1bc6..802aec6 100644 --- a/src/main/kotlin/kr/lunaticbum/back/lun/service/Lama.kt +++ b/src/main/kotlin/kr/lunaticbum/back/lun/service/Lama.kt @@ -3,17 +3,16 @@ package kr.lunaticbum.back.lun.service import com.google.gson.Gson -import com.knuddels.jtokkit.api.IntArrayList +import com.google.gson.annotations.SerializedName import io.micrometer.observation.ObservationRegistry import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.launch import kr.lunaticbum.back.lun.configs.GlobalEnvironment -import kr.lunaticbum.back.lun.controllers.BumlamaResp import kr.lunaticbum.back.lun.controllers.TelegramSendMsg -import kr.lunaticbum.back.lun.controllers.lamaGenerated import kr.lunaticbum.back.lun.model.* import org.jsoup.Jsoup +import org.jsoup.select.Elements import org.springframework.ai.embedding.EmbeddingRequest import org.springframework.ai.ollama.OllamaEmbeddingModel import org.springframework.ai.ollama.api.OllamaApi @@ -25,11 +24,10 @@ import org.springframework.http.MediaType import org.springframework.stereotype.Service import org.springframework.web.reactive.function.BodyInserters import org.springframework.web.reactive.function.client.WebClient -import java.net.URLEncoder +import reactor.kotlin.core.publisher.toMono import java.text.SimpleDateFormat import java.time.Duration import java.util.* -import kotlin.collections.ArrayList @Service @@ -74,6 +72,71 @@ class Lama { .retrieve() .bodyToMono(QCollection::class.java).timeout(Duration.ofMinutes(20L)).block()?.result?.points_count ?: 0L } + + fun jsopFilter(url : String) : String { + val joinString = "\n#" + var lastElement : Elements = Elements() + var body = Jsoup.connect(url).timeout(30000).get().body() + var elements : Elements? = null + if (url.contains("nate.com", true)) { + if (url.contains("view", true)) { + elements = body.select("[class*=articleView]") + }else { + elements = body.select("[class*=postRankSubjectList]") + } + } else if (url.contains("newsis.com/view", true)) { + elements = body.select("[class*=articleView]") + } else if (url.contains("blog.naver.com", true)) { + elements = body.select("[class*=se-viewer]") + } else if (url.contains("bbc.com/korean/articles", true)) { + elements = body.select("main[role$=main]") + } else if (url.contains("chosun.com/client", true)) { + elements = body.select("[class*=articleBody]") + } else if (url.contains("nocutnews.co.kr/news", true)) { + elements = body.select("[class*=container]") + } else if (url.contains("hani.co.kr/arti/", true)) { + elements = body.select("[class*=ArticleDetail]") + } else if (url.contains("yna.co.kr/view", true)) { + elements = body.select("[class*=container]") + } else if (url.contains("newspim.com/news", true)) { + elements = body.select("[class*=container]") + } else { + + } + if (elements?.size ?: 0 > 0) { + elements?.forEach { + lastElement.add(it) + } + } + + if (lastElement.size < 1) { + arrayOf("container","article","main","viewer","content").forEach { + var result = Elements() + result.addAll(body.select("[class*=$it]")) + result.addAll(body.select("[id*=$it]")) + result.addAll(body.select(it)) + result.forEach { if (it.text().length > 100 && it.children().size < 5) { lastElement.add(it) } } + } + } + return if (lastElement.size > 0) { + lastElement.map { it.children().eachText() }.joinToString(joinString) + } else { + body.children().map { it.children().eachText() }.joinToString(joinString) + } + } + +// class WebScrap { +// @SerializedName("query", alternate = ["question"]) +// var query: String? = null +// var original_html: String? = null +// var original_content: String? = null +// var summary: String? = null +// var keywords: ArrayList? = null +// var related_links: ArrayList? = null +// var relatedness_score: Double = 0.0 +// } + + private fun addDocuments(query : String) { val embeddingModel = OllamaEmbeddingModel( @@ -82,45 +145,49 @@ class Lama { ObservationRegistry.create(), ModelManagementOptions.defaults() ) - val gSearch = "https://psn.lunaticbum.kr/search?q=${query?.replace("오늘", SimpleDateFormat("yyyMMdd").format(Date()))}&language=auto&time_range=month&safesearch=0&categories=general&format=json" + val gSearch = "https://psn.lunaticbum.kr/search?q=${query?.replace("오늘", SimpleDateFormat("yyyMMdd").format(Date()))}&language=ko&time_range=month&safesearch=0&categories=general&format=json" println("gSearch >>> ${gSearch}") val sdss = QPut(arrayListOf()) WebClient.create().get() .uri(gSearch) .retrieve() .bodyToMono(SearXng::class.java).timeout(Duration.ofMinutes(20L)).block()?.let { gsResult -> - gsResult.results?.filter { it.score > 0.5}?.forEach { + gsResult.results?.filter { it.score > 0.3}?.forEach { qPointsCount += 1 - println("in filter") - it.originQuery = query + println("in filter ${it.url}") +// it.originQuery = query val data = Gson().toJson(it) println(it.title) - Jsoup.connect(it.url).get().html().let { text -> + jsopFilter(it.url!!).let { text -> try { - println("text >>>>> $text") - it.pageData = chatClient.chat(OllamaApi.ChatRequest.Builder("phi4:14b").stream(false).format("json").messages( - listOf(OllamaApi.Message.Builder(OllamaApi.Message.Role.USER).content("'${text}' 웹 페이지 모든 내욜을 복사 한건데 본문 내용만 정리해줘").build()) - ).build()).message.content - println("summary result >>>>> ${it.pageData}") - val embeddingResponse = embeddingModel.call( - EmbeddingRequest( - listOf(data), - OllamaOptions.builder() - .model("nomic-embed-text") - .truncate(false) - .build() + println("text >>>>> ${text?.chunked(50)?.first() ?: ""}") + var dispoable = chatClient.chat(OllamaApi.ChatRequest.Builder("phi4:14b").stream(false).format("json").messages( + listOf(OllamaApi.Message.Builder(OllamaApi.Message.Role.USER).content("원문:\n'${text}'\n원문의 웹 페이지 소스는 '$query'이 질문에 대해 연관 결과로 받은 내용이야. 해당 정보를 파악해서 'query:{질문},contents:{본문내용},summary:{요약},keywords:[키워드],related_links:[링크],relatedness_score:{0.0~10.0}'이 형식의 결과만들어줘 내용은 한국어로 부탁할께").build()) + ).build()).toMono().subscribe({aiResponce -> + it.pageData = aiResponce.message.content +// println(aiResponce) + println("summary result >>>>> ${it.pageData}") +// it.originHtml = text + val embeddingResponse = embeddingModel.call( + EmbeddingRequest( + listOf(data), + OllamaOptions.builder() + .model("nomic-embed-text") + .truncate(false).build() + ) ) - ) - sdss.points.add(QData(id = qPointsCount,embeddingResponse.result.output,it)) + sdss.points.add(QData(id = qPointsCount,embeddingResponse.result.output,it)) + },{err-> + err.printStackTrace() + }) }catch (e : Exception) { - + e.printStackTrace() } } } } println("out filter") if (sdss.points.size > 0) { - println("sdss.points.size ${sdss.points.size} ${Gson().toJson(sdss)}") val qUrl = "https://ollama.lunaticbum.kr/collections/blama_vectors".plus("/points") val client = WebClient.create() client.put() @@ -140,18 +207,19 @@ class Lama { var lists = client.post() .uri(qUrl) .header("api-key","blama-admin-key-gb") - .body(BodyInserters.fromValue(Gson().toJson(QSearchData(embedFlots,5)))) + .body(BodyInserters.fromValue(Gson().toJson(QSearchData(embedFlots,3)))) .retrieve() .bodyToMono(QSearch::class.java).timeout(Duration.ofMinutes(20L)).block() + println(Gson().toJson(lists)) return if (lists?.result?.size ?: 0 > 0) { val qContents = QContentsList() - lists?.result?.forEach { + qContents.ids.add(it.id) } val qCUrl = "https://ollama.lunaticbum.kr/collections/blama_vectors".plus("/points") val client2 = WebClient.create() - client.post() + client2.post() .uri(qCUrl) .header("api-key", "blama-admin-key-gb") .body(BodyInserters.fromValue(Gson().toJson(qContents)))