|
@@ -1,38 +1,72 @@
|
|
|
<template>
|
|
|
- <ContentWrap>
|
|
|
- <el-form
|
|
|
- class="-mb-15px"
|
|
|
- :model="queryParams"
|
|
|
- ref="queryFormRef"
|
|
|
- :inline="true"
|
|
|
- label-width="110px"
|
|
|
- >
|
|
|
- <el-form-item label="微信公众号链接" prop="urls" class="!w-100%">
|
|
|
- <el-input
|
|
|
- v-model="queryParams.urls"
|
|
|
- class="!w-75%"
|
|
|
- type="textarea"
|
|
|
- :rows="2"
|
|
|
- placeholder="请输入需要解析的微信公众号链接"
|
|
|
- />
|
|
|
- <el-button type="primary" class="ml-10px" plain :loading="loading" @click="handleAnalysis">解析</el-button>
|
|
|
- </el-form-item>
|
|
|
- </el-form>
|
|
|
- </ContentWrap>
|
|
|
-
|
|
|
- <el-row v-if="contents.length" :gutter="20">
|
|
|
- <el-col v-for="(content, index) in contents" :key="index" :span="24">
|
|
|
- <el-card class="!h-600px" v-loading="!content.markdown_text">
|
|
|
- <template #header>
|
|
|
- <div class="flex items-center justify-between">
|
|
|
- <el-text class="flex-1" truncated>{{ content.url }}</el-text>
|
|
|
- <el-button type="primary" plain class="mt-10px" :loading="content.itemLoading" @click="handleSubmit(content)">信息提取</el-button>
|
|
|
- </div>
|
|
|
- </template>
|
|
|
- <iframe :id="content.id" class="!w-100% !h-[calc(100vh-90px)]" src="" frameborder="0"></iframe>
|
|
|
- </el-card>
|
|
|
- </el-col>
|
|
|
- </el-row>
|
|
|
+ <div v-if="!markdown_data || !Object.keys(markdown_data)?.length">
|
|
|
+ <ContentWrap>
|
|
|
+ <el-form
|
|
|
+ class="-mb-15px"
|
|
|
+ :model="queryParams"
|
|
|
+ ref="queryFormRef"
|
|
|
+ :inline="true"
|
|
|
+ label-width="110px"
|
|
|
+ >
|
|
|
+ <el-form-item label="微信公众号链接" prop="urls" class="!w-100%">
|
|
|
+ <el-input
|
|
|
+ v-model="queryParams.urls"
|
|
|
+ class="!w-75%"
|
|
|
+ type="textarea"
|
|
|
+ :rows="2"
|
|
|
+ placeholder="请输入需要解析的微信公众号链接"
|
|
|
+ />
|
|
|
+ <el-button
|
|
|
+ type="primary"
|
|
|
+ class="ml-10px"
|
|
|
+ plain
|
|
|
+ :loading="loading"
|
|
|
+ @click="handleAnalysis"
|
|
|
+ >解析</el-button>
|
|
|
+ </el-form-item>
|
|
|
+ </el-form>
|
|
|
+ </ContentWrap>
|
|
|
+
|
|
|
+ <el-row v-if="contents.length" :gutter="20">
|
|
|
+ <el-col v-for="(content, index) in contents" :key="index" :span="24">
|
|
|
+ <el-card class="!h-600px" v-loading="!content.data">
|
|
|
+ <template #header>
|
|
|
+ <div class="flex items-center justify-between">
|
|
|
+ <el-text class="flex-1" truncated>{{ content.url }}</el-text>
|
|
|
+ <el-button
|
|
|
+ type="primary"
|
|
|
+ plain
|
|
|
+ class="mt-10px"
|
|
|
+ :loading="content.itemLoading"
|
|
|
+ @click="handleSubmit(content, index)"
|
|
|
+ >信息提取</el-button>
|
|
|
+ </div>
|
|
|
+ </template>
|
|
|
+ <iframe
|
|
|
+ :id="content.id"
|
|
|
+ class="!w-100% !h-[calc(100vh-90px)]"
|
|
|
+ src=""
|
|
|
+ frameborder="0"
|
|
|
+ ></iframe>
|
|
|
+ </el-card>
|
|
|
+ </el-col>
|
|
|
+ </el-row>
|
|
|
+ </div>
|
|
|
+ <div v-else class="!h-100%">
|
|
|
+ <div class="text-right">
|
|
|
+ <el-button type="primary" plain @click="handleReturn">返回查看解析内容</el-button>
|
|
|
+ </div>
|
|
|
+ <el-card class="!h-100% mt-10px">
|
|
|
+ <div class="!w-192px !h-250px m-auto">
|
|
|
+ <el-image referrerpolicy="no-referrer" :src="markdown_data.pic_url" class="!w-192px !h-250px" />
|
|
|
+ </div>
|
|
|
+ <pre>{{ markdown_data.name_zh }}</pre>
|
|
|
+ <pre>{{ markdown_data.name_en }}</pre>
|
|
|
+ <pre>{{ markdown_data.hotel_zh }}</pre>
|
|
|
+ <pre>{{ markdown_data.title_zh }}</pre>
|
|
|
+ <pre>{{ markdown_data.detailIntroduction }}</pre>
|
|
|
+ </el-card>
|
|
|
+ </div>
|
|
|
</template>
|
|
|
|
|
|
<script setup>
|
|
@@ -43,13 +77,22 @@ import TurndownService from 'turndown'
|
|
|
import { talentWebParsingApi } from '@/api/menduner/system/talentMap/webParsing.ts'
|
|
|
import { generateUUID } from '@/utils'
|
|
|
|
|
|
// Events sent to the parent:
//   'analysis' - emitted by handleSubmit with the extraction API result
//   'reset'    - emitted at the start of handleAnalysis
const emit = defineEmits(['analysis', 'reset'])
const props = defineProps({
  markDownData: Object
})
const message = useMessage() // message popups
const { t } = useI18n() // i18n

// Local copy of props.markDownData. The template shows the result card while this
// object has keys; handleReturn resets it to {} to switch back to the parsed article.
const markdown_data = ref({})
watch(() => props.markDownData, val => {
  markdown_data.value = val
}, { deep: true })

const loading = ref(false)
const queryParams = reactive({
  // The form must start empty; a hard-coded sample article URL here would be
  // shown (and parsed) as if the user had typed it.
  urls: ''
})
const queryFormRef = ref()
const contents = ref([])
|
|
@@ -77,20 +120,20 @@ const wechatHtmlToMarkdown = (html) => {
|
|
|
// 创建一个临时DOM解析器
|
|
|
const parser = new DOMParser()
|
|
|
const doc = parser.parseFromString(html, 'text/html')
|
|
|
-
|
|
|
+
|
|
|
// 提取正文内容 - 微信公众号文章通常在id="js_content"的div中
|
|
|
const content = doc.querySelector('#js_content') || doc.body
|
|
|
-
|
|
|
+
|
|
|
// 移除不需要的元素
|
|
|
const elementsToRemove = [
|
|
|
- 'script', 'style', 'iframe', 'button',
|
|
|
+ 'script', 'style', 'iframe', 'button',
|
|
|
'.qr_code', '.rich_media_extra', '.copyright'
|
|
|
]
|
|
|
-
|
|
|
+
|
|
|
elementsToRemove.forEach(selector => {
|
|
|
content.querySelectorAll(selector).forEach(el => el.remove())
|
|
|
})
|
|
|
-
|
|
|
+
|
|
|
// 转换为Markdown
|
|
|
return turndownService.turndown(content.innerHTML)
|
|
|
}
|
|
@@ -103,14 +146,49 @@ const handleConvert = (res) => {
|
|
|
return result
|
|
|
}
|
|
|
|
|
|
/**
 * Best-effort lookup of an article's publish time.
 *
 * Strategies are tried in order and the first one yielding non-blank text wins:
 *   1. text of the element with id "publish_time"
 *   2. text of a `.rich_media_meta.rich_media_meta_text` element containing a
 *      `YYYY年M月D日` date
 *   3. `content` of `<meta property="article:published_time">`
 *   4. first `YYYY年M月D日` occurrence in the raw HTML string
 *
 * @param {Document} doc - document to query
 * @param {string} html - raw HTML used by the regex fallback
 * @returns {string} trimmed publish-time text, or '' when nothing was found
 */
function extractPublishTime(doc, html) {
  const datePattern = /\d{4}年\d{1,2}月\d{1,2}日/
  const readText = (el) => (el?.innerText ?? '').trim()

  // 1. By id. A blank or whitespace-only node must not short-circuit the fallbacks.
  const byId = readText(doc.getElementById('publish_time'))
  if (byId) return byId

  // 2. By class: only accept meta text that actually contains a date
  for (const el of doc.querySelectorAll('.rich_media_meta.rich_media_meta_text')) {
    const text = readText(el)
    if (datePattern.test(text)) return text
  }

  // 3. By meta tag
  const meta = doc.querySelector('meta[property="article:published_time"]')
  const metaContent = (meta?.content ?? '').trim()
  if (metaContent) return metaContent

  // 4. By regex over the raw HTML (skipped when html is not a string)
  const match = typeof html === 'string' ? html.match(datePattern) : null
  return match ? match[0] : ''
}
|
|
|
+
|
|
|
/**
 * Polls extractPublishTime until it yields a value or the attempt budget is spent,
 * then reports the outcome through `cb` exactly once.
 *
 * The first attempt happens one `interval` after the call, not immediately.
 * At least one attempt is always made, even when `maxTry` is 0 or negative.
 *
 * @param {Document} doc - document handed to extractPublishTime
 * @param {string} html - raw HTML handed to extractPublishTime
 * @param {(publishTime: string) => void} cb - receives the extracted text, or '' when every attempt failed
 * @param {number} [maxTry=10] - maximum number of extraction attempts
 * @param {number} [interval=200] - delay between attempts in milliseconds
 */
function tryExtractPublishTime(doc, html, cb, maxTry = 10, interval = 200) {
  let tryCount = 0
  const timer = setInterval(() => {
    // Count the attempt before testing the budget so that exactly `maxTry`
    // attempts are made (checking first would allow maxTry + 1).
    tryCount++

    let publishTime
    try {
      publishTime = extractPublishTime(doc, html)
    } catch (err) {
      // Stop polling; otherwise the same failure would be re-thrown on every tick forever.
      clearInterval(timer)
      throw err
    }

    if (publishTime || tryCount >= maxTry) {
      clearInterval(timer)
      cb(publishTime)
    }
  }, interval)
}
|
|
|
+
|
|
|
// 查看原网页
|
|
|
const showPage = (res) => {
|
|
|
if (res.data) {
|
|
|
let html = res.data
|
|
|
html = html.replace(/data-src/g, 'src') // 将 data-src 转化为 src
|
|
|
- .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/g, '') // 移除HTML内容中所有的<script>标签,这样可以避免在iframe中执行潜在的不受信任的脚本。
|
|
|
+ // .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/g, '') // 移除HTML内容中所有的<script>标签,这样可以避免在iframe中执行潜在的不受信任的脚本。
|
|
|
.replace(/https/g, 'http') // 将HTML内容中所有的https替换为http,可能是为了避免在HTTPS环境下加载非HTTPS资源导致浏览器警告
|
|
|
-
|
|
|
nextTick(() => {
|
|
|
const iframe = document.getElementById(res.id)
|
|
|
if (!iframe) return
|
|
@@ -121,25 +199,43 @@ const showPage = (res) => {
|
|
|
doc.open()
|
|
|
doc.write(html_src_add)
|
|
|
doc.close()
|
|
|
+
|
|
|
// 通过延时获取文档高度赋值Iframe去除滚动条,根据实际情况增加延时时间
|
|
|
setTimeout(() => {
|
|
|
const jsContent = doc.getElementById('js_content')
|
|
|
- if (jsContent) {
|
|
|
+ if (jsContent) {
|
|
|
jsContent.style.visibility = 'visible'
|
|
|
jsContent.style.opacity = 1
|
|
|
}
|
|
|
}, 100)
|
|
|
+
|
|
|
+ // 获取发布时间
|
|
|
+ setTimeout(() => {
|
|
|
+ tryExtractPublishTime(doc, html, (publishTime) => {
|
|
|
+ if (publishTime) {
|
|
|
+ res.publish_time = publishTime
|
|
|
+ .replace("年", "-")
|
|
|
+ .replace("月", "-")
|
|
|
+ .replace("日", "")
|
|
|
+ .split(" ")[0];
|
|
|
+ console.log(publishTime, '发布时间', res.publish_time)
|
|
|
+ }
|
|
|
+ });
|
|
|
+ }, 100); // 先等100ms让iframe初步渲染,再开始轮询
|
|
|
})
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-// 提取
|
|
|
+// 解析
|
|
|
const handleAnalysis = async () => {
|
|
|
if (!queryParams.urls) return
|
|
|
|
|
|
+ // 重置右侧标签及表单
|
|
|
+ emit('reset')
|
|
|
+
|
|
|
if (contents.value && contents.value.length) {
|
|
|
- const isAnalysis = contents.value.every(e => e.itemLoading)
|
|
|
- return message.warning('正在解析中,请稍后再试')
|
|
|
+ const isAnalysis = contents.value.some(e => e.itemLoading)
|
|
|
+ if (isAnalysis) return message.warning('正在提取信息中,请稍后再试')
|
|
|
}
|
|
|
|
|
|
contents.value = []
|
|
@@ -153,6 +249,8 @@ const handleAnalysis = async () => {
|
|
|
return
|
|
|
}
|
|
|
|
|
|
+ if (urlArr.length > 1) return message.warning('只支持单个链接解析')
|
|
|
+
|
|
|
const base_url = import.meta.env.VITE_NODE_BASE_URL
|
|
|
axios.post(`${base_url}/process-urls`, { urlArr }, { timeout: 60000 }).then(res => {
|
|
|
if (!res?.data || !res?.data?.contents || !res?.data?.contents.length) return
|
|
@@ -160,6 +258,7 @@ const handleAnalysis = async () => {
|
|
|
list.forEach(e => {
|
|
|
contents.value.push({
|
|
|
...e,
|
|
|
+ publish_time: null,
|
|
|
itemLoading: false,
|
|
|
id: generateUUID(),
|
|
|
markdown_text: handleConvert(e)
|
|
@@ -177,20 +276,34 @@ const handleAnalysis = async () => {
|
|
|
})
|
|
|
}
|
|
|
|
|
|
// Information extraction
// Index (within `contents`) of the item most recently passed to handleSubmit;
// handleReturn reads it to decide which parsed page to render again.
const extractIndex = ref(0)

/**
 * Sends one parsed article to the extraction API and forwards the result to the parent.
 *
 * @param {Object} content - entry of `contents`; reads `markdown_text` and `publish_time`, toggles `itemLoading`
 * @param {number} index - position of `content` inside `contents`
 */
const handleSubmit = async (content, index) => {
  extractIndex.value = index

  // A URL parse is still running
  if (loading.value) {
    return message.warning('正在解析中,请稍后再试')
  }
  // Nothing to send
  if (!content.markdown_text) {
    return
  }

  content.itemLoading = true
  const payload = {
    markdown_text: content.markdown_text,
    publish_time: content.publish_time
  }
  // publish_time is filled in asynchronously by showPage and may still be missing
  if (!payload.publish_time) {
    message.warning('发布时间不能为空')
    content.itemLoading = false
    return
  }

  try {
    const result = await talentWebParsingApi.saveMarkdownContent(payload)
    emit('analysis', result ?? [])
    message.success('信息提取成功')
  } finally {
    // Always release the button, whether the request succeeded or threw
    content.itemLoading = false
  }
}
|
|
|
+
|
|
|
// Return to the parsed-article view.
// Clears the local extraction result (which switches the template back to the
// article list) and re-renders the article that was last submitted for extraction.
const handleReturn = () => {
  markdown_data.value = {}

  const current = contents.value?.[extractIndex.value]
  // The list may have been cleared or the index may be stale; showPage reads
  // `res.data` and would throw on an undefined entry.
  if (!current) return

  showPage(current)
}
|
|
|
</script>
|
|
|
|
|
|
<style scoped>
|