zhengnaiwen_citu
/
menduner-admin


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
							<template>
  <div v-if="!markdown_data || !Object.keys(markdown_data)?.length">
    <ContentWrap>
      <el-form
        class="-mb-15px"
        :model="queryParams"
        ref="queryFormRef"
        :inline="true"
        label-width="110px"
      >
        <el-form-item label="微信公众号链接" prop="urls" class="!w-100%">
          <el-input
            v-model="queryParams.urls"
            class="!w-75%"
            type="textarea"
            :rows="2"
            placeholder="请输入需要解析的微信公众号链接"
          />
          <el-button
						type="primary"
						class="ml-10px"
						plain
						:loading="loading"
						@click="handleAnalysis"
					>解析</el-button>
        </el-form-item>
      </el-form>
    </ContentWrap>

    <el-row v-if="contents.length" :gutter="20">
      <el-col v-for="(content, index) in contents" :key="index" :span="24">
        <el-card class="!h-600px" v-loading="!content.data">
          <template #header>
            <div class="flex items-center justify-between">
              <el-text class="flex-1" truncated>{{ content.url }}</el-text>
              <el-button
                type="primary"
                plain
                class="mt-10px"
                :loading="content.itemLoading"
                @click="handleSubmit(content, index)"
              >信息提取</el-button>
            </div>
          </template>
          <iframe
            :id="content.id"
            class="!w-100% !h-[calc(100vh-90px)]"
            src=""
            frameborder="0"
          ></iframe>
        </el-card>
      </el-col>
    </el-row>
  </div>
  <div v-else class="!h-100%">
    <div class="text-right">
      <el-button type="primary" plain @click="handleReturn">返回查看解析内容</el-button>
    </div>
    <el-card class="!h-100% mt-10px">
      <div class="!w-192px !h-250px m-auto">
				<el-image referrerpolicy="no-referrer" :src="markdown_data.pic_url" class="!w-192px !h-250px" />
			</div>
			<pre>{{ markdown_data.name_zh }}</pre>
			<pre>{{ markdown_data.name_en }}</pre>
			<pre>{{ markdown_data.hotel_zh }}</pre>
			<pre>{{ markdown_data.title_zh }}</pre>
			<pre>{{ markdown_data.detailIntroduction }}</pre>
    </el-card>
  </div>
</template>

<script setup>
/** 人才采集 网页解析 */
defineOptions({ name: 'WebPageParsing' })
import axios from 'axios'
import TurndownService from 'turndown'
import { talentWebParsingApi } from '@/api/menduner/system/talentMap/webParsing.ts'
import { generateUUID } from '@/utils'

// Events sent to the parent:
//   'analysis' - emitted by handleSubmit with the extraction result
//   'reset'    - emitted by handleAnalysis before a new parse starts
const emit = defineEmits(['analysis', 'reset'])
// markDownData: extracted record supplied by the parent; when it is non-empty the
// template shows the detail card instead of the parse form.
const props = defineProps({
	markDownData: Object
})
const message = useMessage() // toast / message helper
const { t } = useI18n() // i18n helper (not referenced elsewhere in this file)

// Local, resettable copy of props.markDownData (handleReturn clears it).
// The watcher is not `immediate`, so only changes made after setup are mirrored here.
const markdown_data = ref({})
watch(() => props.markDownData, val => {
	markdown_data.value = val
}, { deep: true })

// True while handleAnalysis is waiting for the parse request.
const loading = ref(false)
// Form model bound to the URL textarea.
const queryParams = reactive({
	// Alternative sample link: 'https://mp.weixin.qq.com/s/JZ5qxaj9vXsEsswxxD1djA'
	// NOTE(review): the field is pre-filled with a sample article link - confirm this is intended outside of development.
	urls: 'https://mp.weixin.qq.com/s/vQLWlSB6DzqSewtBLkk_kQ'
})
// Template ref of the <el-form>.
const queryFormRef = ref()
// Parsed articles rendered as preview cards; items are built in handleAnalysis.
const contents = ref([])
// Not referenced elsewhere in this file.
const drawer = ref(false)

// Shared HTML -> Markdown converter: ATX headings, fenced code blocks, "-" list bullets.
const turndownService = new TurndownService({
  bulletListMarker: '-',
  codeBlockStyle: 'fenced',
  headingStyle: 'atx'
})

// Custom <img> rule for WeChat articles: the image address is taken from `src`
// and falls back to `data-src`; a missing alt or address becomes an empty string.
turndownService.addRule('wechatImages', {
  filter: 'img',
  replacement(_innerMarkdown, imgNode) {
    const altText = imgNode.getAttribute('alt') || ''
    const imageUrl = imgNode.getAttribute('src') || imgNode.getAttribute('data-src') || ''
    return `![${altText}](${imageUrl})`
  }
})

/**
 * Extracts the main article content from a WeChat page and converts it to Markdown.
 *
 * The article body is looked up via `#js_content`; when that element is missing
 * the whole <body> is used instead. Scripts, styles, iframes, buttons and a few
 * WeChat-specific decoration blocks are stripped before conversion.
 *
 * @param {string} html - Raw HTML of the article page.
 * @returns {string} Markdown produced by the shared turndown service.
 */
const wechatHtmlToMarkdown = (html) => {
  // Parse into a detached document so the live page is never touched.
  const parsedDoc = new DOMParser().parseFromString(html, 'text/html')
  const articleRoot = parsedDoc.querySelector('#js_content') || parsedDoc.body

  // Elements that carry no article content.
  const noiseSelectors = [
    'script', 'style', 'iframe', 'button',
    '.qr_code', '.rich_media_extra', '.copyright'
  ]
  for (const selector of noiseSelectors) {
    for (const node of articleRoot.querySelectorAll(selector)) {
      node.remove()
    }
  }

  return turndownService.turndown(articleRoot.innerHTML)
}

/**
 * Converts one fetched article to Markdown.
 *
 * @param {{ data?: string }} res - Fetched article item; `data` holds the page HTML.
 * @returns {string|undefined} The Markdown text, or `undefined` when there is no
 *   HTML or the conversion produced nothing (a warning toast is shown in that case).
 */
const handleConvert = (res) => {
  if (!res?.data) return undefined

  const markdown = wechatHtmlToMarkdown(res.data)
  if (!markdown) {
    // Show the warning but do not return its result: the caller stores this
    // function's return value as `markdown_text`, so it must be a string or undefined.
    message.warning('转换失败')
    return undefined
  }
  return markdown
}

/**
 * Looks for the article's publish time, trying four sources in order and
 * returning the first non-empty hit:
 *   1. the element with id `publish_time`
 *   2. a `.rich_media_meta.rich_media_meta_text` element whose text contains a
 *      date written as "YYYY年M月D日"
 *   3. the `article:published_time` <meta> tag
 *   4. the first "YYYY年M月D日" date found anywhere in the raw HTML
 *
 * @param {Document} doc - Document to inspect.
 * @param {string} html - Raw HTML used for the regex fallback.
 * @returns {string} The publish time text (trimmed for sources 1-3), or '' if none is found.
 */
function extractPublishTime(doc, html) {
  const chineseDate = /\d{4}年\d{1,2}月\d{1,2}日/

  // 1. Dedicated element.
  const byId = doc.getElementById('publish_time')
  if (byId?.innerText) return byId.innerText.trim()

  // 2. Meta text nodes that contain a Chinese-formatted date.
  for (const node of doc.querySelectorAll('.rich_media_meta.rich_media_meta_text')) {
    const text = node.innerText
    if (text && chineseDate.test(text)) return text.trim()
  }

  // 3. <meta> tag.
  const metaTag = doc.querySelector('meta[property="article:published_time"]')
  if (metaTag?.content) return metaTag.content.trim()

  // 4. Regex over the raw HTML.
  const found = html.match(/(\d{4}年\d{1,2}月\d{1,2}日)/)
  return found ? found[1] : ''
}

/**
 * Polls `extractPublishTime` until it yields a value or the attempt budget is used up,
 * then reports the result exactly once through `cb`.
 *
 * The first attempt happens one `interval` after the call.
 *
 * @param {Document} doc - Document to inspect on every attempt.
 * @param {string} html - Raw HTML passed through to `extractPublishTime`.
 * @param {(publishTime: string) => void} cb - Receives the publish time, or '' when
 *   nothing was found within `maxTry` attempts.
 * @param {number} [maxTry=10] - Maximum number of extraction attempts (at least one is made).
 * @param {number} [interval=200] - Delay between attempts in milliseconds.
 */
function tryExtractPublishTime(doc, html, cb, maxTry = 10, interval = 200) {
  let attempts = 0
  const timer = setInterval(() => {
    // Count the attempt before checking the budget so that exactly `maxTry`
    // attempts are made.
    attempts++
    const publishTime = extractPublishTime(doc, html)
    if (publishTime || attempts >= maxTry) {
      clearInterval(timer)
      cb(publishTime)
    }
  }, interval)
}

/**
 * Renders a fetched article into its preview iframe and fills in `res.publish_time`.
 *
 * Does nothing when `res` is missing, has no HTML, or its iframe (looked up by
 * `res.id`) is not in the DOM after the next tick.
 *
 * @param {{ id: string, data?: string, publish_time?: string|null }} [res] - Article
 *   item; `publish_time` is written asynchronously once a date has been found.
 */
const showPage = (res) => {
  if (!res?.data) return

  const html = res.data
    // Lazy-loaded attributes: turn data-src into src so the content loads without the page's own scripts.
    .replace(/data-src/g, 'src')
    // Every "https" in the markup is rewritten to "http".
    // NOTE(review): kept as-is from the original; confirm it is still wanted, since
    // it can cause mixed-content blocking when the host page is served over HTTPS.
    .replace(/https/g, 'http')

  nextTick(() => {
    const iframe = document.getElementById(res.id)
    if (!iframe) return
    const doc = iframe.contentDocument || iframe.contentWindow?.document
    if (!doc) return

    // Ask the iframe not to send a referrer so hot-link protected images load.
    // Insert before the FIRST </head> only: later occurrences (e.g. inside script
    // strings) must not truncate the page, and a page without </head> gets the
    // tag prepended instead.
    const referrerMeta = '<meta name="referrer" content="never">'
    const headEnd = html.indexOf('</head>')
    const pageHtml = headEnd === -1
      ? referrerMeta + html
      : html.slice(0, headEnd) + referrerMeta + html.slice(headEnd)

    doc.open()
    doc.write(pageHtml)
    doc.close()

    // After a short delay, force the article body to be visible.
    setTimeout(() => {
      const jsContent = doc.getElementById('js_content')
      if (jsContent) {
        jsContent.style.visibility = 'visible'
        jsContent.style.opacity = 1
      }
    }, 100)

    // Give the iframe 100ms to render, then start polling for the publish time.
    setTimeout(() => {
      tryExtractPublishTime(doc, html, (publishTime) => {
        if (!publishTime) return
        // "2024年5月6日 10:00" -> "2024-5-6": swap the date markers and drop any time part.
        res.publish_time = publishTime
          .replace('年', '-')
          .replace('月', '-')
          .replace('日', '')
          .split(' ')[0]
      })
    }, 100)
  })
}

/**
 * Tells whether `url` is an https link on the WeChat official-account article host.
 * The URL is parsed, so strings that merely contain the host somewhere are rejected.
 */
const isWeChatArticleUrl = (url) => {
  try {
    const { protocol, hostname } = new URL(url)
    return protocol === 'https:' && hostname === 'mp.weixin.qq.com'
  } catch {
    return false
  }
}

/**
 * Parses the WeChat article link entered in the form.
 *
 * All validation runs before any state is touched, so a rejected input leaves the
 * current cards, the parent's form and the loading flag as they were. Only then:
 * emits 'reset', clears `contents`, posts the URL list to the parsing service and
 * renders each returned article via `showPage`.
 *
 * Request failures are logged and shown as an error message; `loading` is always
 * cleared when the request settles.
 */
const handleAnalysis = async () => {
  if (!queryParams.urls) return

  // Do not start a new parse while an extraction is still running on a card.
  if (contents.value?.some(e => e.itemLoading)) {
    return message.warning('正在提取信息中，请稍后再试')
  }

  const urlArr = queryParams.urls.split(',').map(url => url.trim()).filter(Boolean)

  // An input made only of separators leaves nothing to parse; treat it as invalid.
  if (!urlArr.length || !urlArr.every(isWeChatArticleUrl)) {
    message.warning('请输入微信公众文章链接')
    return
  }

  if (urlArr.length > 1) return message.warning('只支持单个链接解析')

  // Input is valid: reset the parent's tabs/form and start from a clean list.
  emit('reset')
  contents.value = []
  loading.value = true

  try {
    const base_url = import.meta.env.VITE_NODE_BASE_URL
    const res = await axios.post(`${base_url}/process-urls`, { urlArr }, { timeout: 60000 })
    const list = res?.data?.contents
    if (!Array.isArray(list) || !list.length) return

    contents.value = list.map(e => ({
      ...e,
      publish_time: null,
      itemLoading: false,
      id: generateUUID(),
      markdown_text: handleConvert(e)
    }))
    // Iterate the reactive list (not `list`) so that showPage's later write to
    // `publish_time` goes through the reactive items.
    contents.value.forEach(item => showPage(item))
  } catch (err) {
    console.error(err)
    message.error(err.message)
  } finally {
    loading.value = false
  }
}

// Information extraction.
// Index (within `contents`) of the card whose extraction was last requested;
// handleSubmit writes it and handleReturn reads it to re-render that card's preview.
const extractIndex = ref(0)
/**
 * Sends one parsed article to the extraction API and forwards the result to the parent.
 *
 * Remembers `index` in `extractIndex`, then bails out when a parse is still
 * running, when the article has no Markdown text, or when no publish time is set.
 * On success emits 'analysis' with the API result (or [] when it is nullish).
 * `content.itemLoading` is always reset, even when the request throws.
 *
 * @param {object} content - Article item from `contents`.
 * @param {number} index - Position of `content` inside `contents`.
 */
const handleSubmit = async (content, index) => {
  extractIndex.value = index

  if (loading.value) {
    return message.warning('正在解析中，请稍后再试')
  }
  if (!content.markdown_text) {
    return
  }

  content.itemLoading = true
  const payload = {
    markdown_text: content.markdown_text,
    publish_time: content.publish_time
  }

  if (!payload.publish_time) {
    message.warning('发布时间不能为空')
    content.itemLoading = false
    return
  }

  try {
    const extracted = await talentWebParsingApi.saveMarkdownContent(payload)
    emit('analysis', extracted ?? [])
    message.success('信息提取成功')
  } finally {
    content.itemLoading = false
  }
}

/**
 * Leaves the extraction-result view and goes back to the parsed article list.
 *
 * Clears the local result so the template switches back to the list view, then
 * re-renders the preview of the card that was last submitted (the iframe is
 * recreated by that switch). Nothing is rendered when no such card exists.
 */
const handleReturn = () => {
  markdown_data.value = {}

  const current = contents.value[extractIndex.value]
  if (current) showPage(current)
}
</script>

<style scoped>
</style>