webAnalysis.vue 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. <template>
  2. <ContentWrap>
  3. <el-form
  4. class="-mb-15px"
  5. :model="queryParams"
  6. ref="queryFormRef"
  7. :inline="true"
  8. label-width="110px"
  9. >
  10. <el-form-item label="微信公众号链接" prop="urls" class="!w-100%">
  11. <el-input
  12. v-model="queryParams.urls"
  13. class="!w-75%"
  14. type="textarea"
  15. :rows="2"
  16. placeholder="请输入需要解析的微信公众号链接"
  17. />
  18. <el-button
  19. type="primary"
  20. class="ml-10px"
  21. plain
  22. @click="handleAnalysis"
  23. >解析</el-button>
  24. </el-form-item>
  25. </el-form>
  26. </ContentWrap>
  27. <el-row v-if="contents.length" :gutter="20">
  28. <el-col v-for="(content, index) in contents" :key="index" :span="24">
  29. <el-card class="!h-600px" v-loading="!content.data">
  30. <template #header>
  31. <div class="flex items-center justify-between">
  32. <el-text class="flex-1" truncated>{{ content.url }}</el-text>
  33. <el-button
  34. type="primary"
  35. plain
  36. class="mt-10px"
  37. @click="handleSubmit(content, index)"
  38. >信息提取</el-button>
  39. </div>
  40. </template>
  41. <iframe
  42. :id="content.id"
  43. class="!w-100% !h-[calc(100vh-90px)]"
  44. src=""
  45. frameborder="0"
  46. ></iframe>
  47. </el-card>
  48. </el-col>
  49. </el-row>
  50. </template>
  51. <script setup>
/** Talent collection: web page parsing */
  53. defineOptions({ name: 'WebPageParsing' })
  54. import axios from 'axios'
  55. import TurndownService from 'turndown'
  56. import { talentWebParsingApi } from '@/api/menduner/system/talentMap/webParsing.ts'
  57. import { generateUUID } from '@/utils'
  58. import { ElLoading } from 'element-plus'
  59. const emit = defineEmits(['analysis', 'reset'])
  60. const message = useMessage() // 消息弹窗
  61. const { t } = useI18n() // 国际化
  62. const queryParams = reactive({
  63. // urls: 'https://mp.weixin.qq.com/s/JZ5qxaj9vXsEsswxxD1djA' // https://mp.weixin.qq.com/s/R1aJpn9z-Jf0dk9ttoYYeg
  64. // urls: 'https://mp.weixin.qq.com/s/vQLWlSB6DzqSewtBLkk_kQ'
  65. urls: ''
  66. })
  67. const queryFormRef = ref()
  68. const contents = ref([])
  69. const drawer = ref(false)
  70. // 创建转换服务
  71. const turndownService = new TurndownService({
  72. headingStyle: 'atx',
  73. codeBlockStyle: 'fenced',
  74. bulletListMarker: '-'
  75. })
  76. // 添加自定义规则处理微信公众号特有内容
  77. turndownService.addRule('wechatImages', {
  78. filter: 'img',
  79. replacement: (content, node) => {
  80. const alt = node.getAttribute('alt') || ''
  81. const src = node.getAttribute('src') || node.getAttribute('data-src') || ''
  82. return `![${alt}](${src})`
  83. }
  84. })
  85. // 提取主要内容并转换为Markdown
  86. const wechatHtmlToMarkdown = (html) => {
  87. // 创建一个临时DOM解析器
  88. const parser = new DOMParser()
  89. const doc = parser.parseFromString(html, 'text/html')
  90. // 提取正文内容 - 微信公众号文章通常在id="js_content"的div中
  91. const content = doc.querySelector('#img_content') || doc.body
  92. // 移除不需要的元素
  93. const elementsToRemove = [
  94. 'script', 'style', 'iframe', 'button',
  95. '.qr_code', '.rich_media_extra', '.copyright'
  96. ]
  97. elementsToRemove.forEach(selector => {
  98. content.querySelectorAll(selector).forEach(el => el.remove())
  99. })
  100. // 转换为Markdown
  101. return turndownService.turndown(content.innerHTML)
  102. }
  103. // 转换为markdown格式
  104. const handleConvert = (res) => {
  105. if (!res.data) return
  106. const result = wechatHtmlToMarkdown(res.data)
  107. if (!result) return message.warning('转换失败')
  108. return result
  109. }
  110. function extractPublishTime(doc, html) {
  111. // 1. 通过 id
  112. let timeEl = doc.getElementById('publish_time')
  113. if (timeEl && timeEl.innerText) return timeEl.innerText.trim()
  114. // 2. 通过 class
  115. let metaEls = doc.querySelectorAll('.rich_media_meta.rich_media_meta_text')
  116. for (let el of metaEls) {
  117. if (el.innerText && /\d{4}年\d{1,2}月\d{1,2}日/.test(el.innerText)) {
  118. return el.innerText.trim()
  119. }
  120. }
  121. // 3. 通过 meta 标签
  122. let meta = doc.querySelector('meta[property="article:published_time"]')
  123. if (meta && meta.content) return meta.content.trim()
  124. // 4. 通过正则从 html 里提取
  125. let match = html.match(/(\d{4}年\d{1,2}月\d{1,2}日)/)
  126. if (match) return match[1]
  127. return ''
  128. }
  129. function tryExtractPublishTime(doc, html, cb, maxTry = 10, interval = 200) {
  130. let tryCount = 0
  131. const timer = setInterval(() => {
  132. const publishTime = extractPublishTime(doc, html)
  133. if (publishTime || tryCount >= maxTry) {
  134. clearInterval(timer)
  135. cb(publishTime)
  136. }
  137. tryCount++
  138. }, interval)
  139. }
  140. // 查看原网页
  141. const showPage = (res) => {
  142. if (res.data) {
  143. let html = res.data
  144. html = html.replace(/data-src/g, 'src') // 将 data-src 转化为 src
  145. // .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/g, '') // 移除HTML内容中所有的<script>标签,这样可以避免在iframe中执行潜在的不受信任的脚本。
  146. .replace(/https/g, 'http') // 将HTML内容中所有的https替换为http,可能是为了避免在HTTPS环境下加载非HTTPS资源导致浏览器警告
  147. nextTick(() => {
  148. const iframe = document.getElementById(res.id)
  149. if (!iframe) return
  150. const doc = iframe.contentDocument || iframe.document
  151. // 设置 iframe 中请求不发送 referrer,以绕过图片防盗链
  152. const htmlArr = html.split('</head>')
  153. const html_src_add = htmlArr[0] + '<meta name="referrer" content="never"></head>' + htmlArr[1]
  154. doc.open()
  155. doc.write(html_src_add)
  156. doc.close()
  157. // 通过延时获取文档高度赋值Iframe去除滚动条,根据实际情况增加延时时间
  158. setTimeout(() => {
  159. const jsContent = doc.getElementById('js_content')
  160. if (jsContent) {
  161. jsContent.style.visibility = 'visible'
  162. jsContent.style.opacity = 1
  163. }
  164. }, 100)
  165. // 获取发布时间
  166. setTimeout(() => {
  167. tryExtractPublishTime(doc, html, (publishTime) => {
  168. if (publishTime) {
  169. res.publish_time = publishTime
  170. .replace("年", "-")
  171. .replace("月", "-")
  172. .replace("日", "")
  173. .split(" ")[0];
  174. console.log(publishTime, '发布时间', res.publish_time)
  175. }
  176. });
  177. }, 100); // 先等100ms让iframe初步渲染,再开始轮询
  178. })
  179. }
  180. }
  181. // 解析
  182. const handleAnalysis = async () => {
  183. if (!queryParams.urls) return
  184. if (contents.value?.length > 0) {
  185. await message.confirm('是否确认重新解析?确认后当前内容将会清空!')
  186. // 重置右侧标签及表单
  187. emit('reset')
  188. }
  189. const loading = ElLoading.service({
  190. lock: true,
  191. text: '正在解析中...',
  192. background: 'rgba(0, 0, 0, 0.7)',
  193. })
  194. contents.value = []
  195. const urlArr = queryParams.urls.split(',').map(url => url.trim()).filter(Boolean)
  196. const isWeChatUrl = urlArr.every(url => url.includes('https://mp.weixin.qq.com'))
  197. if (!isWeChatUrl) {
  198. message.warning('请输入微信公众文章链接')
  199. return
  200. }
  201. if (urlArr.length > 1) return message.warning('只支持单个链接解析')
  202. const base_url = import.meta.env.VITE_NODE_BASE_URL
  203. axios.post(`${base_url}/process-urls`, { urlArr }, { timeout: 60000 }).then(res => {
  204. if (!res?.data || !res?.data?.contents || !res?.data?.contents.length) return
  205. const list = res?.data?.contents
  206. list.forEach(e => {
  207. contents.value.push({
  208. ...e,
  209. publish_time: null,
  210. id: generateUUID(),
  211. markdown_text: handleConvert(e)
  212. })
  213. })
  214. contents.value.forEach(e => {
  215. showPage(e)
  216. })
  217. }).catch(err => {
  218. console.log(err, 'error');
  219. message.error(err.message)
  220. }).finally(_ => {
  221. loading.close()
  222. })
  223. }
  224. // 信息提取
  225. const handleSubmit = async (content, index) => {
  226. if (!content.markdown_text) return
  227. const loading = ElLoading.service({
  228. lock: true,
  229. text: '信息正在提取中...',
  230. background: 'rgba(0, 0, 0, 0.7)',
  231. })
  232. const { markdown_text, publish_time } = content
  233. if (!publish_time) {
  234. message.warning('发布时间不能为空')
  235. loading.close()
  236. return
  237. }
  238. try {
  239. const data = await talentWebParsingApi.saveMarkdownContent({ markdown_text, publish_time })
  240. emit('analysis', data ?? [], markdown_text)
  241. message.success('信息提取成功')
  242. } finally {
  243. loading.close()
  244. }
  245. }
  246. </script>
  247. <style scoped>
  248. </style>