webAnalysis.vue 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  <template>
    <!-- Input / preview view: shown while there is no extraction result to display -->
    <div v-if="!markdown_data || !Object.keys(markdown_data).length">
      <ContentWrap>
        <el-form
          class="-mb-15px"
          :model="queryParams"
          ref="queryFormRef"
          :inline="true"
          label-width="110px"
        >
          <el-form-item label="微信公众号链接" prop="urls" class="!w-100%">
            <el-input
              v-model="queryParams.urls"
              class="!w-75%"
              type="textarea"
              :rows="2"
              placeholder="请输入需要解析的微信公众号链接"
            />
            <el-button
              type="primary"
              class="ml-10px"
              plain
              :loading="loading"
              @click="handleAnalysis"
            >解析</el-button>
          </el-form-item>
        </el-form>
      </ContentWrap>
      <el-row v-if="contents.length" :gutter="20">
        <!-- Keyed by the generated entry id (not the array index) so every iframe stays bound to its own entry -->
        <el-col v-for="(content, index) in contents" :key="content.id" :span="24">
          <el-card class="!h-600px" v-loading="!content.data">
            <template #header>
              <div class="flex items-center justify-between">
                <el-text class="flex-1" truncated>{{ content.url }}</el-text>
                <el-button
                  type="primary"
                  plain
                  class="mt-10px"
                  :loading="content.itemLoading"
                  @click="handleSubmit(content, index)"
                >信息提取</el-button>
              </div>
            </template>
            <!-- The article HTML is written into this iframe by showPage(), which looks it up by id -->
            <iframe
              :id="content.id"
              class="!w-100% !h-[calc(100vh-90px)]"
              src=""
              frameborder="0"
            ></iframe>
          </el-card>
        </el-col>
      </el-row>
    </div>
    <!-- Result view: shown once markdown_data holds at least one key -->
    <div v-else class="!h-100%">
      <div class="text-right">
        <el-button type="primary" plain @click="handleReturn">返回查看解析内容</el-button>
      </div>
      <el-card class="!h-100% mt-10px">
        <div class="!w-192px !h-250px m-auto">
          <el-image referrerpolicy="no-referrer" :src="markdown_data.pic_url" class="!w-192px !h-250px" />
        </div>
        <pre>{{ markdown_data.name_zh }}</pre>
        <pre>{{ markdown_data.name_en }}</pre>
        <pre>{{ markdown_data.hotel_zh }}</pre>
        <pre>{{ markdown_data.title_zh }}</pre>
        <pre>{{ markdown_data.detailIntroduction }}</pre>
      </el-card>
    </div>
  </template>
  70. <script setup>
  71. /** 人才采集 网页解析 */
  72. defineOptions({ name: 'WebPageParsing' })
  73. import axios from 'axios'
  74. import TurndownService from 'turndown'
  75. import { talentWebParsingApi } from '@/api/menduner/system/talentMap/webParsing.ts'
  76. import { generateUUID } from '@/utils'
  // Component events: 'analysis' carries the extraction result, 'reset' is fired before a new parse starts.
  const emit = defineEmits(['analysis', 'reset'])
  // markDownData: result object supplied by the parent component.
  const props = defineProps({
    markDownData: Object
  })

  // Message pop-up helper
  const message = useMessage()
  // Internationalization
  const { t } = useI18n()

  // Local copy of props.markDownData; the template shows the result view when it has keys.
  const markdown_data = ref({})
  const syncMarkdownData = (incoming) => {
    markdown_data.value = incoming
  }
  watch(() => props.markDownData, syncMarkdownData, { deep: true })

  // True while the parse request is running.
  const loading = ref(false)
  // Form model. The default value is a sample article link.
  // Alternative sample: 'https://mp.weixin.qq.com/s/JZ5qxaj9vXsEsswxxD1djA'
  const queryParams = reactive({
    urls: 'https://mp.weixin.qq.com/s/vQLWlSB6DzqSewtBLkk_kQ'
  })
  // Template ref of the <el-form>.
  const queryFormRef = ref()
  // Parsed article entries rendered as cards.
  const contents = ref([])
  // NOTE(review): not referenced anywhere in the visible part of this file.
  const drawer = ref(false)
  // Create the HTML -> Markdown conversion service
  const turndownOptions = {
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    bulletListMarker: '-'
  }
  const turndownService = new TurndownService(turndownOptions)

  // Custom rule for WeChat official-account content: render every <img> as a Markdown image,
  // taking the address from `src` first and from `data-src` when `src` is missing or empty.
  const renderWechatImage = (_content, imgNode) => {
    const altText = imgNode.getAttribute('alt') || ''
    const source = imgNode.getAttribute('src') || imgNode.getAttribute('data-src') || ''
    return `![${altText}](${source})`
  }
  turndownService.addRule('wechatImages', {
    filter: 'img',
    replacement: renderWechatImage
  })
  /**
   * Extract the main content of an article page and convert it to Markdown.
   *
   * @param {string} html Raw HTML of the page.
   * @returns {string} Markdown produced by `turndownService`.
   */
  const wechatHtmlToMarkdown = (html) => {
    // Parse the markup into a temporary document
    const parsedDoc = new DOMParser().parseFromString(html, 'text/html')

    // The article body is looked up in the element with id="js_content"; otherwise the whole <body> is used
    const root = parsedDoc.querySelector('#js_content') || parsedDoc.body

    // Strip elements that should not end up in the Markdown output
    const unwantedSelectors = [
      'script', 'style', 'iframe', 'button',
      '.qr_code', '.rich_media_extra', '.copyright'
    ]
    for (const selector of unwantedSelectors) {
      for (const node of root.querySelectorAll(selector)) {
        node.remove()
      }
    }

    // Convert what is left to Markdown
    const remainingHtml = root.innerHTML
    return turndownService.turndown(remainingHtml)
  }
  /**
   * Convert one fetched page entry to Markdown.
   *
   * Shows a warning when the conversion yields nothing. The result is always either
   * a non-empty string or `undefined`; the return value of `message.warning()` is
   * never leaked to the caller, which stores this value as `markdown_text`.
   *
   * @param {{ data?: string }} res Entry whose `data` field holds the raw HTML.
   * @returns {string|undefined} Markdown text, or `undefined` when there is nothing to convert
   *   or the conversion produced an empty result.
   */
  const handleConvert = (res) => {
    if (!res?.data) return undefined
    const markdown = wechatHtmlToMarkdown(res.data)
    if (!markdown) {
      message.warning('转换失败')
      return undefined
    }
    return markdown
  }
  /**
   * Find the publish time of an article, trying several sources in order:
   *   1. the element with id "publish_time"
   *   2. elements matching ".rich_media_meta.rich_media_meta_text" whose text contains a "YYYY年M月D日" date
   *   3. the <meta property="article:published_time"> tag
   *   4. the first "YYYY年M月D日" date found in the raw HTML string
   *
   * A source that only yields blank text is skipped so the later sources still get a chance.
   *
   * @param {Document} doc Document to inspect.
   * @param {string} html Raw HTML used by the last-resort regex search; non-string values are ignored.
   * @returns {string} Trimmed publish time text, or '' when nothing was found.
   */
  function extractPublishTime(doc, html) {
    const DATE_PATTERN = /\d{4}年\d{1,2}月\d{1,2}日/
    // innerText is preferred; textContent is only used when innerText is not available on the node
    const textOf = (el) => (el?.innerText ?? el?.textContent ?? '').trim()

    // 1. By id
    const idText = textOf(doc.getElementById('publish_time'))
    if (idText) return idText

    // 2. By class
    for (const el of doc.querySelectorAll('.rich_media_meta.rich_media_meta_text')) {
      const text = textOf(el)
      if (DATE_PATTERN.test(text)) return text
    }

    // 3. By meta tag
    const metaContent = doc.querySelector('meta[property="article:published_time"]')?.content?.trim()
    if (metaContent) return metaContent

    // 4. By regex over the raw html
    if (typeof html !== 'string') return ''
    return html.match(DATE_PATTERN)?.[0] ?? ''
  }
  /**
   * Poll `extractPublishTime` until it yields a value or the attempt budget is used up.
   *
   * The first attempt runs after one `interval`; at most `maxTry` attempts are made
   * (always at least one). `cb` is invoked exactly once, with the publish time or ''
   * when every attempt came back empty.
   *
   * @param {Document} doc Document to inspect.
   * @param {string} html Raw HTML passed through to `extractPublishTime`.
   * @param {(publishTime: string) => void} cb Receives the result.
   * @param {number} [maxTry=10] Maximum number of attempts.
   * @param {number} [interval=200] Delay between attempts in milliseconds.
   * @returns {() => void} Function that cancels the polling; `cb` is not called after cancelling.
   */
  function tryExtractPublishTime(doc, html, cb, maxTry = 10, interval = 200) {
    let attempts = 0
    const timer = setInterval(() => {
      // Count the attempt before checking the budget so exactly `maxTry` attempts are made
      attempts++
      const publishTime = extractPublishTime(doc, html)
      if (publishTime || attempts >= maxTry) {
        clearInterval(timer)
        cb(publishTime)
      }
    }, interval)
    return () => clearInterval(timer)
  }
  /**
   * Insert `<meta name="referrer" content="never">` right before the first `</head>`.
   * When the markup has no `</head>` the tag is prepended instead, so nothing is lost
   * and no "undefined" text is produced.
   *
   * @param {string} html Page markup.
   * @returns {string} Markup containing the referrer meta tag.
   */
  const injectNoReferrerMeta = (html) => {
    const referrerMeta = '<meta name="referrer" content="never">'
    const headEnd = html.indexOf('</head>')
    if (headEnd === -1) return `${referrerMeta}${html}`
    return `${html.slice(0, headEnd)}${referrerMeta}${html.slice(headEnd)}`
  }

  /**
   * Render the original page of a parsed entry inside its iframe and start looking
   * for the publish time, which is stored on `res.publish_time` once found.
   *
   * Does nothing when `res` is missing, has no `data`, or its iframe is not in the DOM.
   *
   * @param {{ id: string, data?: string, publish_time?: string|null }} res Parsed entry;
   *   `id` is the DOM id of the target iframe, `data` the raw page HTML.
   */
  const showPage = (res) => {
    if (!res?.data) return

    const html = res.data
      // Turn every "data-src" into "src"
      .replace(/data-src/g, 'src')
      // Replace every "https" with "http". Original note: possibly to avoid browser warnings
      // about loading non-HTTPS resources. NOTE(review): this is a blanket text replacement.
      .replace(/https/g, 'http')

    // Wait for the DOM update so the iframe of this entry exists
    nextTick(() => {
      const iframe = document.getElementById(res.id)
      if (!iframe) return
      const doc = iframe.contentDocument || iframe.contentWindow?.document
      if (!doc) return

      // Requests made inside the iframe send no referrer, to get around image hotlink protection
      doc.open()
      doc.write(injectNoReferrerMeta(html))
      doc.close()

      // After a short delay force #js_content to be visible (presumably the page hides it
      // until its own scripts run); increase the delay if needed
      setTimeout(() => {
        const jsContent = doc.getElementById('js_content')
        if (jsContent) {
          jsContent.style.visibility = 'visible'
          jsContent.style.opacity = 1
        }
      }, 100)

      // Publish time: give the iframe 100ms for an initial render, then start polling
      setTimeout(() => {
        tryExtractPublishTime(doc, html, (publishTime) => {
          if (!publishTime) return
          // e.g. "2024年5月1日 10:00" -> "2024-5-1"
          res.publish_time = publishTime
            .replace('年', '-')
            .replace('月', '-')
            .replace('日', '')
            .split(' ')[0]
          console.log(publishTime, '发布时间', res.publish_time)
        })
      }, 100)
    })
  }
  /**
   * Check that a string is an https URL whose host is exactly mp.weixin.qq.com.
   * Parsing the URL (instead of a substring search) rejects links that merely
   * contain the WeChat address somewhere, e.g. in a query string.
   *
   * @param {string} url Candidate link.
   * @returns {boolean} True for a WeChat official-account article link.
   */
  const isWeChatArticleUrl = (url) => {
    try {
      const { protocol, hostname } = new URL(url)
      return protocol === 'https:' && hostname === 'mp.weixin.qq.com'
    } catch {
      return false
    }
  }

  /**
   * Parse the link entered in the form.
   *
   * Emits 'reset', validates the input, posts the link to the `/process-urls` endpoint
   * and fills `contents` with the returned pages (each converted to Markdown and
   * rendered into its iframe). `loading` is only raised once validation has passed and
   * is always lowered again, so a rejected input can no longer leave the button
   * stuck in its loading state.
   *
   * @returns {Promise<void>} Settles when the request has finished (or was skipped).
   */
  const handleAnalysis = async () => {
    if (!queryParams.urls) return

    // Reset the tabs and form on the right-hand side
    emit('reset')

    if (contents.value?.some(e => e.itemLoading)) {
      message.warning('正在提取信息中,请稍后再试')
      return
    }
    contents.value = []

    const urlArr = queryParams.urls.split(',').map(url => url.trim()).filter(Boolean)
    if (!urlArr.length || !urlArr.every(isWeChatArticleUrl)) {
      message.warning('请输入微信公众文章链接')
      return
    }
    if (urlArr.length > 1) {
      message.warning('只支持单个链接解析')
      return
    }

    loading.value = true
    try {
      const base_url = import.meta.env.VITE_NODE_BASE_URL
      const res = await axios.post(`${base_url}/process-urls`, { urlArr }, { timeout: 60000 })
      const list = res?.data?.contents
      if (!list?.length) return

      contents.value = list.map(e => ({
        ...e,
        publish_time: null,
        itemLoading: false,
        id: generateUUID(),
        markdown_text: handleConvert(e)
      }))
      // Iterate the entries stored in `contents` (not the raw response items) because
      // showPage later writes `publish_time` onto the entry it is given
      contents.value.forEach(e => {
        showPage(e)
      })
    } catch (err) {
      console.log(err, 'error')
      message.error(err.message)
    } finally {
      loading.value = false
    }
  }
  247. // 信息提取
  248. const extractIndex = ref(0)
  /**
   * Send the Markdown of one parsed entry to the backend for information extraction.
   *
   * Remembers the entry index, refuses to run while a parse is in progress, and
   * requires both Markdown text and a publish time. On success the result is
   * emitted through the 'analysis' event (an empty array when the API returns nothing).
   *
   * @param {object} content Parsed entry (reads `markdown_text` / `publish_time`, toggles `itemLoading`).
   * @param {number} index Position of the entry in `contents`.
   */
  const handleSubmit = async (content, index) => {
    extractIndex.value = index

    if (loading.value) {
      return message.warning('正在解析中,请稍后再试')
    }
    if (!content.markdown_text) {
      return
    }

    content.itemLoading = true
    try {
      const { markdown_text, publish_time } = content
      if (!publish_time) {
        message.warning('发布时间不能为空')
        return
      }
      const payload = { markdown_text, publish_time }
      const extracted = await talentWebParsingApi.saveMarkdownContent(payload)
      emit('analysis', extracted ?? [])
      message.success('信息提取成功')
    } finally {
      // Always release the per-entry loading flag, whatever happened above
      content.itemLoading = false
    }
  }
  /**
   * Go back from the result view to the parsed page.
   *
   * Clears the local result so the template switches back to the preview view, then
   * re-renders the entry that was last submitted for extraction. When no entry exists
   * at `extractIndex` (for example nothing has been parsed yet) only the result is cleared.
   */
  const handleReturn = () => {
    markdown_data.value = {}
    const current = contents.value?.[extractIndex.value]
    if (!current) return
    showPage(current)
  }
  273. </script>
  274. <style scoped>
  275. </style>