Browse Source

人才采集:新任命解析获取发布时间

Xiao_123 cách đây 3 ngày
mục cha
commit
8011bbed87

+ 48 - 48
src/views/menduner/system/talentMap/maintenance/gather/components/webAnalysis.vue

@@ -123,41 +123,41 @@ const wechatHtmlToMarkdown = (html, filename = '新任命.md') => {
 // 	return result
 // }
 
-// function extractPublishTime(doc, html) {
-//   // 1. 通过 id
-//   let timeEl = doc.getElementById('publish_time')
-//   if (timeEl && timeEl.innerText) return timeEl.innerText.trim()
-
-//   // 2. 通过 class
-//   let metaEls = doc.querySelectorAll('.rich_media_meta.rich_media_meta_text')
-//   for (let el of metaEls) {
-//     if (el.innerText && /\d{4}年\d{1,2}月\d{1,2}日/.test(el.innerText)) {
-//       return el.innerText.trim()
-//     }
-//   }
-
-//   // 3. 通过 meta 标签
-//   let meta = doc.querySelector('meta[property="article:published_time"]')
-//   if (meta && meta.content) return meta.content.trim()
-
-//   // 4. 通过正则从 html 里提取
-//   let match = html.match(/(\d{4}年\d{1,2}月\d{1,2}日)/)
-//   if (match) return match[1]
-
-//   return ''
-// }
+function extractPublishTime(doc, html) { // Returns the first publish-time text found by the lookups below, or '' when none match.
+  // 1. Look up by element id
+  let timeEl = doc.getElementById('publish_time')
+  if (timeEl && timeEl.innerText) return timeEl.innerText.trim()
+
+  // 2. Look up by class; only accept text containing a YYYY年M月D日 date (the whole trimmed text is returned, not just the date)
+  let metaEls = doc.querySelectorAll('.rich_media_meta.rich_media_meta_text')
+  for (let el of metaEls) {
+    if (el.innerText && /\d{4}年\d{1,2}月\d{1,2}日/.test(el.innerText)) {
+      return el.innerText.trim()
+    }
+  }
 
-// function tryExtractPublishTime(doc, html, cb, maxTry = 10, interval = 200) {
-//   let tryCount = 0
-//   const timer = setInterval(() => {
-//     const publishTime = extractPublishTime(doc, html)
-//     if (publishTime || tryCount >= maxTry) {
-//       clearInterval(timer)
-//       cb(publishTime)
-//     }
-//     tryCount++
-//   }, interval)
-// }
+  // 3. Look up the article:published_time meta tag
+  let meta = doc.querySelector('meta[property="article:published_time"]')
+  if (meta && meta.content) return meta.content.trim()
+
+  // 4. Fall back to the first YYYY年M月D日 match in the raw html string
+  let match = html.match(/(\d{4}年\d{1,2}月\d{1,2}日)/)
+  if (match) return match[1]
+
+  return '' // none of the lookups produced a value
+}
+
+function tryExtractPublishTime(doc, html, cb, maxTry = 10, interval = 200) { // Polls extractPublishTime every `interval` ms and hands the result to cb once, then stops.
+  let tryCount = 0
+  const timer = setInterval(() => {
+    const publishTime = extractPublishTime(doc, html)
+    if (publishTime || tryCount >= maxTry) { // stop on the first truthy result, or once the counter reaches maxTry
+      clearInterval(timer)
+      cb(publishTime) // a falsy publishTime here means polling gave up without finding a value
+    }
+    tryCount++ // NOTE(review): incremented after the check, so up to maxTry + 1 attempts run — confirm this is intended
+  }, interval)
+}
 
 // 查看原网页
 const showPage = (res) => {
@@ -168,7 +168,7 @@ const showPage = (res) => {
   html = html.replace(/data-src/g, 'src')
     // 需要获取文章发布时间的话需注释下一行代码
     // 移除HTML内容中所有的<script>标签,这样可以避免在iframe中执行潜在的不受信任的脚本。
-    .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/g, '')
+    // .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/g, '')
     // 将HTML内容中所有的https替换为http,可能是为了避免在HTTPS环境下加载非HTTPS资源导致浏览器警告
     .replace(/https/g, 'http')
   
@@ -193,18 +193,18 @@ const showPage = (res) => {
     }, 100)
 
     // 获取发布时间
-    // setTimeout(() => {
-    //   tryExtractPublishTime(doc, html, (publishTime) => {
-    //     if (publishTime) {
-    //       res.publish_time = publishTime
-    //         .replace("年", "-")
-    //         .replace("月", "-")
-    //         .replace("日", "")
-    //         .split(" ")[0];
-    //       console.log(publishTime, '发布时间', res.publish_time)
-    //     }
-    //   });
-    // }, 100); // 先等100ms让iframe初步渲染,再开始轮询
+    setTimeout(() => {
+      tryExtractPublishTime(doc, html, (publishTime) => {
+        if (publishTime) {
+          res.publish_time = publishTime
+            .replace("年", "-")
+            .replace("月", "-")
+            .replace("日", "")
+            .split(" ")[0];
+          console.log(publishTime, '发布时间', res.publish_time)
+        }
+      });
+    }, 100); // 先等100ms让iframe初步渲染,再开始轮询
   })
 }
 
@@ -243,7 +243,7 @@ const handleAnalysis = async () => {
 		list.forEach(e => {
 			contents.value.push({
 				...e,
-				// publish_time: null,
+				publish_time: null,
 				id: generateUUID(),
         file: wechatHtmlToMarkdown(e.data)
 				// markdown_text: handleConvert(e)