<template>
  <!-- Input form: the WeChat official-account article link to parse -->
  <ContentWrap>
    <el-form
      class="-mb-15px"
      :model="queryParams"
      ref="queryFormRef"
      :inline="true"
      label-width="110px"
    >
      <el-form-item label="微信公众号链接" prop="urls" class="!w-100%">
        <el-input
          v-model="queryParams.urls"
          class="!w-75%"
          type="textarea"
          :rows="2"
          placeholder="请输入需要解析的微信公众号链接"
        />
        <el-button
          type="primary"
          class="ml-10px"
          plain
          @click="handleAnalysis"
        >解析</el-button>
      </el-form-item>
    </el-form>
  </ContentWrap>

  <!-- One card per parsed article. Keyed by the generated id (not the array index) so each
       iframe stays bound to its own article when the list is replaced. -->
  <el-row v-if="contents.length" :gutter="20">
    <el-col v-for="(content, index) in contents" :key="content.id" :span="24">
      <el-card class="!h-600px" v-loading="!content.data">
        <template #header>
          <div class="flex items-center justify-between">
            <el-text class="flex-1" truncated>{{ content.url }}</el-text>
            <el-button
              type="primary"
              plain
              class="mt-10px"
              @click="handleSubmit(content, index)"
            >信息提取</el-button>
          </div>
        </template>
        <!-- The article HTML is written into this iframe from script, looked up by its id -->
        <iframe
          :id="content.id"
          class="!w-100% !h-[calc(100vh-90px)]"
          src=""
          frameborder="0"
        ></iframe>
      </el-card>
    </el-col>
  </el-row>
</template>
<script setup>
/** Talent collection: web page parsing (WeChat official-account articles). */
import axios from 'axios'
import TurndownService from 'turndown'
import { talentWebParsingApi } from '@/api/menduner/system/talentMap/webParsing.ts'
import { generateUUID } from '@/utils'
import { ElLoading } from 'element-plus'

defineOptions({ name: 'WebPageParsing' })

// Events sent to the parent: `analysis` carries the extraction result, `reset` clears the parent's state.
const emit = defineEmits(['analysis', 'reset'])
const message = useMessage() // message popups
const { t } = useI18n() // i18n

// Form model bound to the link textarea.
const queryParams = reactive({
  // Sample article links for manual testing:
  // urls: 'https://mp.weixin.qq.com/s/JZ5qxaj9vXsEsswxxD1djA' // https://mp.weixin.qq.com/s/R1aJpn9z-Jf0dk9ttoYYeg
  // urls: 'https://mp.weixin.qq.com/s/vQLWlSB6DzqSewtBLkk_kQ'
  urls: ''
})
const queryFormRef = ref()
// Parsed articles rendered as cards; each entry gets an `id`, `markdown_text` and `publish_time`.
const contents = ref([])
const drawer = ref(false)
// Shared HTML -> Markdown converter used for every parsed article.
const turndownService = new TurndownService({
  headingStyle: 'atx',
  codeBlockStyle: 'fenced',
  bulletListMarker: '-'
})

// Custom image rule for WeChat articles: the image address may live in `data-src`
// instead of `src`, so both attributes are consulted.
turndownService.addRule('wechatImages', {
  filter: 'img',
  replacement: (content, node) => {
    const alt = node.getAttribute('alt') || ''
    const src = node.getAttribute('src') || node.getAttribute('data-src') || ''
    // An image without any address cannot be rendered; emit nothing for it.
    if (!src) return ''
    return `![${alt}](${src})`
  }
})
/**
 * Extract the main content of a WeChat article page and convert it to Markdown.
 *
 * @param {string} html - Raw HTML of the article page.
 * @returns {string} Markdown produced by the shared `turndownService`.
 */
const wechatHtmlToMarkdown = (html) => {
  // Parse into a detached document so the live page is never touched.
  const doc = new DOMParser().parseFromString(html, 'text/html')

  // Prefer the `#img_content` wrapper, then the `#js_content` article body,
  // and only fall back to the whole <body> when neither is present.
  const root =
    doc.querySelector('#img_content') || doc.querySelector('#js_content') || doc.body

  // Strip elements that carry no article text.
  const selectorsToRemove = [
    'script', 'style', 'iframe', 'button',
    '.qr_code', '.rich_media_extra', '.copyright'
  ]
  for (const selector of selectorsToRemove) {
    root.querySelectorAll(selector).forEach((el) => el.remove())
  }

  return turndownService.turndown(root.innerHTML)
}
/**
 * Convert one parsed article to Markdown.
 *
 * @param {{ data?: string }} res - Parsed article entry; `data` holds the page HTML.
 * @returns {string|undefined} The Markdown text, or `undefined` when there is no HTML
 *   or the conversion produced nothing (a warning is shown in the latter case).
 */
const handleConvert = (res) => {
  if (!res?.data) return undefined

  const markdown = wechatHtmlToMarkdown(res.data)
  if (!markdown) {
    // Warn, but never leak the popup's return value into `markdown_text`.
    message.warning('转换失败')
    return undefined
  }
  return markdown
}
/**
 * Find the publish time of a WeChat article, trying several strategies in order:
 * the `#publish_time` element, the meta-text elements, the
 * `article:published_time` meta tag, and finally a date pattern in the raw HTML.
 *
 * @param {Document} doc - Document the article was rendered into.
 * @param {string} html - Raw HTML of the article, used as the last resort.
 * @returns {string} The publish time text as found, or `''` when nothing matched.
 */
function extractPublishTime(doc, html) {
  const datePattern = /\d{4}年\d{1,2}月\d{1,2}日/
  // Prefer innerText, fall back to textContent when innerText is empty or unavailable.
  const textOf = (el) => (el?.innerText || el?.textContent || '').trim()

  // 1. By id. A whitespace-only element counts as "not found" so later strategies still run.
  const byId = textOf(doc.getElementById('publish_time'))
  if (byId) return byId

  // 2. By class: the first meta-text element that contains a date.
  for (const el of doc.querySelectorAll('.rich_media_meta.rich_media_meta_text')) {
    const text = textOf(el)
    if (datePattern.test(text)) return text
  }

  // 3. By meta tag.
  const meta = doc.querySelector('meta[property="article:published_time"]')
  const metaContent = (meta?.content || '').trim()
  if (metaContent) return metaContent

  // 4. By pattern in the raw HTML.
  const match = typeof html === 'string' ? html.match(datePattern) : null
  return match ? match[0] : ''
}
/**
 * Poll `extractPublishTime` until it yields a value or the attempts run out.
 * The first attempt happens after one `interval`; `cb` is always called
 * asynchronously and exactly once.
 *
 * @param {Document} doc - Document the article was rendered into.
 * @param {string} html - Raw HTML of the article.
 * @param {(publishTime: string) => void} cb - Receives the publish time, or `''` on failure.
 * @param {number} [maxTry=10] - Maximum number of extraction attempts.
 * @param {number} [interval=200] - Delay between attempts in milliseconds.
 */
function tryExtractPublishTime(doc, html, cb, maxTry = 10, interval = 200) {
  let attempts = 0
  const timer = setInterval(() => {
    attempts += 1

    let publishTime = ''
    try {
      publishTime = extractPublishTime(doc, html)
    } catch (err) {
      // Without this the interval would keep firing (and throwing) forever.
      clearInterval(timer)
      console.error(err)
      cb('')
      return
    }

    // Count the attempt before comparing so exactly `maxTry` attempts are made.
    if (publishTime || attempts >= maxTry) {
      clearInterval(timer)
      cb(publishTime)
    }
  }, interval)
}
/**
 * Render the original article page inside its iframe and fill in `res.publish_time`.
 *
 * @param {{ id: string, data?: string, publish_time?: string|null }} res - Parsed article
 *   entry; `id` is the id of the target iframe and `data` the page HTML.
 */
const showPage = (res) => {
  if (!res?.data || typeof res.data !== 'string') return

  // NOTE(review): the blanket https -> http rewrite also touches article text and
  // script content; kept as-is because the page rendering currently depends on it.
  const html = res.data
    .replace(/data-src/g, 'src') // turn `data-src` attributes into `src`
    .replace(/https/g, 'http')

  // Ask the iframe not to send a referrer, to get around image hotlink protection.
  // The tag goes right before the first </head>, or in front of the markup when
  // the page has no </head> at all.
  const referrerMeta = '<meta name="referrer" content="never">'
  const headEnd = html.indexOf('</head>')
  const pageHtml =
    headEnd === -1
      ? `${referrerMeta}${html}`
      : `${html.slice(0, headEnd)}${referrerMeta}${html.slice(headEnd)}`

  // Wait for Vue to render the iframe element before looking it up.
  nextTick(() => {
    const iframe = document.getElementById(res.id)
    if (!iframe) return
    const doc = iframe.contentDocument || iframe.contentWindow?.document
    if (!doc) return

    doc.open()
    doc.write(pageHtml)
    doc.close()

    // Give the iframe 100ms for a first render before touching its content.
    setTimeout(() => {
      // Force the article body visible.
      const jsContent = doc.getElementById('js_content')
      if (jsContent) {
        jsContent.style.visibility = 'visible'
        jsContent.style.opacity = 1
      }

      // Poll for the publish time and store it as a date like `2024-1-2`.
      tryExtractPublishTime(doc, html, (publishTime) => {
        if (!publishTime) return
        res.publish_time = publishTime
          .replace('年', '-')
          .replace('月', '-')
          .replace('日', '')
          .split(' ')[0]
      })
    }, 100)
  })
}
/**
 * Parse the entered WeChat article link: validate it, ask the Node service for
 * the page content, convert each result to Markdown and render it in its iframe.
 *
 * Validation runs before anything is cleared or the loading overlay is opened,
 * so an invalid input can never leave the screen locked.
 */
const handleAnalysis = async () => {
  const urlArr = (queryParams.urls || '')
    .split(',')
    .map((url) => url.trim())
    .filter(Boolean)
  if (!urlArr.length) return

  // Check the parsed URL's host; a substring match would also accept the
  // WeChat address hidden inside another site's query string.
  const isWeChatArticleUrl = (url) => {
    try {
      const { protocol, hostname } = new URL(url)
      return protocol === 'https:' && hostname === 'mp.weixin.qq.com'
    } catch {
      return false
    }
  }
  if (!urlArr.every(isWeChatArticleUrl)) {
    message.warning('请输入微信公众文章链接')
    return
  }
  if (urlArr.length > 1) {
    message.warning('只支持单个链接解析')
    return
  }

  if (contents.value?.length > 0) {
    try {
      await message.confirm('是否确认重新解析?确认后当前内容将会清空!')
    } catch {
      return // the user cancelled; keep the current content
    }
    // Reset the tags and form on the parent side.
    emit('reset')
  }

  contents.value = []
  const loading = ElLoading.service({
    lock: true,
    text: '正在解析中...',
    background: 'rgba(0, 0, 0, 0.7)',
  })
  try {
    const base_url = import.meta.env.VITE_NODE_BASE_URL
    const res = await axios.post(`${base_url}/process-urls`, { urlArr }, { timeout: 60000 })
    const list = res?.data?.contents
    if (!Array.isArray(list) || !list.length) return

    contents.value = list.map((item) => ({
      ...item,
      publish_time: null, // filled in once the page has been rendered
      id: generateUUID(), // doubles as the id of the iframe for this article
      markdown_text: handleConvert(item)
    }))
    // Iterate the stored entries (not `list`) so later writes go to the displayed objects.
    contents.value.forEach((item) => showPage(item))
  } catch (err) {
    console.error(err, 'error')
    message.error(err.message)
  } finally {
    loading.close()
  }
}
/**
 * Send an article's Markdown and publish time to the backend for information
 * extraction and pass the result to the parent through the `analysis` event.
 *
 * @param {{ markdown_text?: string, publish_time?: string|null }} content - Parsed article entry.
 * @param {number} index - Position of the entry in the list (currently unused).
 */
const handleSubmit = async (content, index) => {
  const { markdown_text, publish_time } = content
  if (!markdown_text) return
  // Validate before opening the overlay so a rejected submit never locks the screen.
  if (!publish_time) {
    message.warning('发布时间不能为空')
    return
  }

  const loading = ElLoading.service({
    lock: true,
    text: '信息正在提取中...',
    background: 'rgba(0, 0, 0, 0.7)',
  })
  try {
    const data = await talentWebParsingApi.saveMarkdownContent({ markdown_text, publish_time })
    emit('analysis', data ?? [], markdown_text)
    message.success('信息提取成功')
  } finally {
    // Request errors still propagate to the caller; only the overlay is cleaned up here.
    loading.close()
  }
}
</script>
<style scoped>
</style>