Jelajahi Sumber

网页解析

Xiao_123 1 Minggu lalu
induk
melakukan
3c8526537f

+ 1 - 0
src/api/menduner/system/talentMap/webParsing.ts

@@ -6,6 +6,7 @@ export const talentWebParsingApi = {
 		return await request.post({ 
 			url: `/api/parse/webpage-parse`,
 			data,
+			timeout: 300000,
 			baseURL: import.meta.env.VITE_BASE_URL
 		})
 	}

+ 165 - 52
src/views/menduner/system/talentMap/maintenance/gather/components/webAnalysis.vue

@@ -1,38 +1,72 @@
 <template>
-	<ContentWrap>
-		<el-form
-			class="-mb-15px"
-			:model="queryParams"
-			ref="queryFormRef"
-			:inline="true"
-			label-width="110px"
-		>
-			<el-form-item label="微信公众号链接" prop="urls" class="!w-100%">
-				<el-input
-					v-model="queryParams.urls"
-					class="!w-75%"
-					type="textarea"
-					:rows="2"
-					placeholder="请输入需要解析的微信公众号链接"
-					/>
-					<el-button type="primary" class="ml-10px" plain :loading="loading" @click="handleAnalysis">解析</el-button>
-			</el-form-item>
-		</el-form>
-	</ContentWrap>
-
-	<el-row v-if="contents.length" :gutter="20">
-		<el-col v-for="(content, index) in contents" :key="index" :span="24">
-			<el-card class="!h-600px" v-loading="!content.markdown_text">
-				<template #header>
-					<div class="flex items-center justify-between">
-						<el-text class="flex-1" truncated>{{ content.url }}</el-text>
-						<el-button type="primary" plain class="mt-10px" :loading="content.itemLoading" @click="handleSubmit(content)">信息提取</el-button>
-					</div>
-				</template>
-				<iframe :id="content.id" class="!w-100% !h-[calc(100vh-90px)]" src="" frameborder="0"></iframe>
-			</el-card>
-		</el-col>
-	</el-row>
+  <div v-if="!markdown_data || !Object.keys(markdown_data)?.length">
+    <ContentWrap>
+      <el-form
+        class="-mb-15px"
+        :model="queryParams"
+        ref="queryFormRef"
+        :inline="true"
+        label-width="110px"
+      >
+        <el-form-item label="微信公众号链接" prop="urls" class="!w-100%">
+          <el-input
+            v-model="queryParams.urls"
+            class="!w-75%"
+            type="textarea"
+            :rows="2"
+            placeholder="请输入需要解析的微信公众号链接"
+          />
+          <el-button
+						type="primary"
+						class="ml-10px"
+						plain
+						:loading="loading"
+						@click="handleAnalysis"
+					>解析</el-button>
+        </el-form-item>
+      </el-form>
+    </ContentWrap>
+
+    <el-row v-if="contents.length" :gutter="20">
+      <el-col v-for="(content, index) in contents" :key="index" :span="24">
+        <el-card class="!h-600px" v-loading="!content.data">
+          <template #header>
+            <div class="flex items-center justify-between">
+              <el-text class="flex-1" truncated>{{ content.url }}</el-text>
+              <el-button
+                type="primary"
+                plain
+                class="mt-10px"
+                :loading="content.itemLoading"
+                @click="handleSubmit(content, index)"
+              >信息提取</el-button>
+            </div>
+          </template>
+          <iframe
+            :id="content.id"
+            class="!w-100% !h-[calc(100vh-90px)]"
+            src=""
+            frameborder="0"
+          ></iframe>
+        </el-card>
+      </el-col>
+    </el-row>
+  </div>
+  <div v-else class="!h-100%">
+    <div class="text-right">
+      <el-button type="primary" plain @click="handleReturn">返回查看解析内容</el-button>
+    </div>
+    <el-card class="!h-100% mt-10px">
+      <div class="!w-192px !h-250px m-auto">
+				<el-image referrerpolicy="no-referrer" :src="markdown_data.pic_url" class="!w-192px !h-250px" />
+			</div>
+			<pre>{{ markdown_data.name_zh }}</pre>
+			<pre>{{ markdown_data.name_en }}</pre>
+			<pre>{{ markdown_data.hotel_zh }}</pre>
+			<pre>{{ markdown_data.title_zh }}</pre>
+			<pre>{{ markdown_data.detailIntroduction }}</pre>
+    </el-card>
+  </div>
 </template>
 
 <script setup>
@@ -43,13 +77,22 @@ import TurndownService from 'turndown'
 import { talentWebParsingApi } from '@/api/menduner/system/talentMap/webParsing.ts'
 import { generateUUID } from '@/utils'
 
-const emit = defineEmits(['analysis'])
+const emit = defineEmits(['analysis', 'reset'])
+const props = defineProps({
+	markDownData: Object
+})
 const message = useMessage() // 消息弹窗
 const { t } = useI18n() // 国际化
 
+// Local copy of the markDownData prop; handleReturn resets this copy without touching the prop
+const markdown_data = ref({})
+// Mirror the prop into the local copy whenever it changes (deep watch, no immediate run)
+watch(() => props.markDownData, val => {
+	markdown_data.value = val
+}, { deep: true })
+
 const loading = ref(false)
 const queryParams = reactive({
-	urls: ''
+	// urls: 'https://mp.weixin.qq.com/s/JZ5qxaj9vXsEsswxxD1djA'
+	urls: 'https://mp.weixin.qq.com/s/vQLWlSB6DzqSewtBLkk_kQ'
 })
 const queryFormRef = ref()
 const contents = ref([])
@@ -77,20 +120,20 @@ const wechatHtmlToMarkdown = (html) => {
   // 创建一个临时DOM解析器
   const parser = new DOMParser()
   const doc = parser.parseFromString(html, 'text/html')
-  
+
   // 提取正文内容 - 微信公众号文章通常在id="js_content"的div中
   const content = doc.querySelector('#js_content') || doc.body
-  
+
   // 移除不需要的元素
   const elementsToRemove = [
-    'script', 'style', 'iframe', 'button', 
+    'script', 'style', 'iframe', 'button',
     '.qr_code', '.rich_media_extra', '.copyright'
   ]
-  
+
   elementsToRemove.forEach(selector => {
     content.querySelectorAll(selector).forEach(el => el.remove())
   })
-  
+
   // 转换为Markdown
   return turndownService.turndown(content.innerHTML)
 }
@@ -103,14 +146,49 @@ const handleConvert = (res) => {
 	return result
 }
 
+/**
+ * Find the article's publish time, trying several sources in order.
+ *
+ * @param doc  document that is queried via getElementById / querySelector / querySelectorAll
+ * @param html HTML string used as the last-resort regex source
+ * @returns the first value found (trimmed element/meta text, or the matched date), or '' if none
+ */
+function extractPublishTime(doc, html) {
+  // 1. Element with id "publish_time"
+  let timeEl = doc.getElementById('publish_time')
+  if (timeEl && timeEl.innerText) return timeEl.innerText.trim()
+
+  // 2. Meta-text elements (by class) whose text contains a YYYY年M月D日 date
+  let metaEls = doc.querySelectorAll('.rich_media_meta.rich_media_meta_text')
+  for (let el of metaEls) {
+    if (el.innerText && /\d{4}年\d{1,2}月\d{1,2}日/.test(el.innerText)) {
+      return el.innerText.trim()
+    }
+  }
+
+  // 3. The article:published_time <meta> tag
+  let meta = doc.querySelector('meta[property="article:published_time"]')
+  if (meta && meta.content) return meta.content.trim()
+
+  // 4. First YYYY年M月D日 date found anywhere in the raw html string
+  let match = html.match(/(\d{4}年\d{1,2}月\d{1,2}日)/)
+  if (match) return match[1]
+
+  return ''
+}
+
+function tryExtractPublishTime(doc, html, cb, maxTry = 10, interval = 200) {
+  let tryCount = 0
+  const timer = setInterval(() => {
+    const publishTime = extractPublishTime(doc, html)
+    if (publishTime || tryCount >= maxTry) {
+      clearInterval(timer)
+      cb(publishTime)
+    }
+    tryCount++
+  }, interval)
+}
+
 // 查看原网页
 const showPage = (res) => {
   if (res.data) {
     let html = res.data
     html = html.replace(/data-src/g, 'src') // 将 data-src 转化为 src
-      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/g, '') // 移除HTML内容中所有的<script>标签,这样可以避免在iframe中执行潜在的不受信任的脚本。
+      // .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/g, '') // 移除HTML内容中所有的<script>标签,这样可以避免在iframe中执行潜在的不受信任的脚本。
       .replace(/https/g, 'http') // 将HTML内容中所有的https替换为http,可能是为了避免在HTTPS环境下加载非HTTPS资源导致浏览器警告
-    
     nextTick(() => {
       const iframe = document.getElementById(res.id)
       if (!iframe) return
@@ -121,25 +199,43 @@ const showPage = (res) => {
       doc.open()
       doc.write(html_src_add)
       doc.close()
+
       // 通过延时获取文档高度赋值Iframe去除滚动条,根据实际情况增加延时时间
       setTimeout(() => {
         const jsContent = doc.getElementById('js_content')
-        if (jsContent) {
+				if (jsContent) {
           jsContent.style.visibility = 'visible'
           jsContent.style.opacity = 1
         }
       }, 100)
+
+      // 获取发布时间
+      setTimeout(() => {
+        tryExtractPublishTime(doc, html, (publishTime) => {
+          if (publishTime) {
+            res.publish_time = publishTime
+              .replace("年", "-")
+              .replace("月", "-")
+              .replace("日", "")
+              .split(" ")[0];
+            console.log(publishTime, '发布时间', res.publish_time)
+          }
+        });
+      }, 100); // 先等100ms让iframe初步渲染,再开始轮询
     })
   }
 }
 
-// 提取
+// 解析
 const handleAnalysis = async () => {
 	if (!queryParams.urls) return
 
+  // 重置右侧标签及表单
+  emit('reset')
+
 	if (contents.value && contents.value.length) {
-		const isAnalysis = contents.value.every(e => e.itemLoading)
-		return message.warning('正在解析中,请稍后再试')
+		const isAnalysis = contents.value.some(e => e.itemLoading)
+		if (isAnalysis) return message.warning('正在提取信息中,请稍后再试')
 	}
 
 	contents.value = []
@@ -153,6 +249,8 @@ const handleAnalysis = async () => {
 		return
 	}
 
+	if (urlArr.length > 1) return message.warning('只支持单个链接解析')
+
 	const base_url = import.meta.env.VITE_NODE_BASE_URL
 	axios.post(`${base_url}/process-urls`, { urlArr }, { timeout: 60000 }).then(res => {
 		if (!res?.data || !res?.data?.contents || !res?.data?.contents.length) return
@@ -160,6 +258,7 @@ const handleAnalysis = async () => {
 		list.forEach(e => {
 			contents.value.push({
 				...e,
+				publish_time: null,
 				itemLoading: false,
 				id: generateUUID(),
 				markdown_text: handleConvert(e)
@@ -177,20 +276,34 @@ const handleAnalysis = async () => {
 	})
 }
 
-// 解析
-const handleSubmit = async (content) => {
-	if (loading.value) return message.warning('正在提取中,请稍后再试')
+// 信息提取
+const extractIndex = ref(0)
+const handleSubmit = async (content, index) => {
+	extractIndex.value = index
+	if (loading.value) return message.warning('正在解析中,请稍后再试')
 	if (!content.markdown_text) return
 
 	content.itemLoading = true
+  const { markdown_text, publish_time } = content
+	if (!publish_time) {
+		message.warning('发布时间不能为空')
+		content.itemLoading = false
+		return
+	}
 	try {
-		const data = await talentWebParsingApi.saveMarkdownContent({ markdown_text: content.markdown_text })
-		emit('analysis', data ?? {})
+		const data = await talentWebParsingApi.saveMarkdownContent({ markdown_text, publish_time })
+		emit('analysis', data ?? [])
 		message.success('信息提取成功')
 	} finally {
 		content.itemLoading = false
 	}
 }
+
+// 返回查看解析内容
+const handleReturn = () => {
+	markdown_data.value =  {}
+	showPage(contents.value[extractIndex.value])
+}
 </script>
 
 <style scoped>

+ 75 - 7
src/views/menduner/system/talentMap/maintenance/gather/index.vue

@@ -133,7 +133,7 @@
     <!-- 解析回显 -->
     <Dialog :title="radioObject[radioValue]" v-model="dialog_analysisInfo" width="90%" @close="dialog_analysisInfo = false">
       <div class="analysisInfoBox">
-        <div class="analysisFile">
+        <div class="analysisFile" :style="{'width': showFormPage || tagList?.length > 0 ? '50%' :'100%'}">
           <!-- 门墩儿人才库 -->
           <template v-if="radioValue === 'menduner'">
             <el-tabs v-model="activeName" type="border-card">
@@ -215,14 +215,41 @@
           </template>
           <!-- 网页解析 -->
           <template v-if="radioValue === 'web'">
-            <webAnalysis v-if="showWebAnalysis" @analysis="val => formData = val" />
+            <webAnalysis
+              v-if="showWebAnalysis"
+              :markDownData="markDownData"
+              @analysis="val => tagList = val"
+              @reset="handleWebClear(), showFormPage = false, formData = {}"
+            />
           </template>
         </div>
-        <FormPage ref="FormPageRef" :analysisType="analysisType" :itemData="formData" />
+        <div class="flex-1">
+          <div class="tagBox mb-10px" v-if="tagList?.length">
+            <el-tag 
+              type="primary" 
+              round
+              size="large"
+              v-for="(val, index) in tagList"
+              :key="index + val.name_zh"
+              class="mr-10px cursor-pointer mb-10px"
+              @click="handleTagClick(val, index)"
+            >
+              {{ val.name_zh }}
+              <Icon v-if="index === tagCurrentIndex" icon="ep:check" class="ml-5px" /> 
+            </el-tag>
+          </div>
+          <FormPage
+            v-if="showFormPage"
+            ref="FormPageRef"
+            :formType="analysisType"
+            :itemData="formData"
+          />
+          <!-- Compare against null explicitly: index 0 is a valid selection but is falsy -->
+          <div v-if="tagList?.length && tagCurrentIndex === null" class="!h-100% flex items-center justify-center">请点击上方人才姓名进行查看编辑</div>
+        </div>
       </div>
       <template #footer>
-        <el-button @click="handleSave" type="success" :disabled="analysisLoading">保 存</el-button>
-        <el-button @click="dialog_analysisInfo = false">取 消</el-button>
+        <el-button @click="handleSave" type="success" :loading="formLoading">保 存</el-button>
+        <!-- handleWebClear must be invoked; a bare reference in an inline handler does nothing -->
+        <el-button @click="dialog_analysisInfo = false, handleWebClear()">取 消</el-button>
       </template>
     </Dialog>
   </ContentWrap>
@@ -305,6 +332,30 @@ const handleDelete = async (id) => {
   } catch {}
 }
 
+// 网页解析
+const tagCurrentIndex = ref(null)
+const tagList = ref([])
+const markDownData = ref({})
+/**
+ * Select one extracted entry from the tag list.
+ * Records the selected index, hands the whole entry to the form via formData,
+ * and builds the reduced preview object stored in markDownData.
+ *
+ * @param val   the clicked tagList entry
+ * @param index its position in tagList
+ */
+const handleTagClick = (val, index) => {
+  tagCurrentIndex.value = index
+  formData.value = val
+  markDownData.value = {
+    name_zh: val.name_zh,
+    name_en: val.name_en,
+    hotel_zh: val.hotel_zh,
+    title_zh: val.title_zh,
+    pic_url: val.pic_url,
+    detailIntroduction: ''
+  }
+  // Make sure the form is visible once an entry has been chosen
+  if (!showFormPage.value) showFormPage.value = true
+}
+
+/** Reset the web-parsing state: no selected tag, empty preview data, empty tag list. */
+const handleWebClear = () => {
+  tagCurrentIndex.value = null
+  markDownData.value = {}
+  tagList.value = []
+}
+
 /** 编辑 */
 const { push } = useRouter()
 const handleEdit = async (item) => {
@@ -354,9 +405,12 @@ const activeName = ref('info')
 const dialog_analysisInfo = ref(false)
 const formLoading = ref(false)
 const analysisType = ref('')
+const showFormPage = ref(true)
 const FormPageRef = ref(null)
 const mergeFormRef = ref() // 合并表单 Ref
 const handleSave = async () => {
+  if (!FormPageRef.value) return message.warning('请将表单信息完善后再提交')
+
   const params = { ...FormPageRef.value.formQuery, type: radioValue.value }
   if (!params.name_zh) return message.warning('请填写姓名!')
   
@@ -402,6 +456,7 @@ const handleSave = async () => {
     cardFileQuery.value = null
     formLoading.value = false
     openSearch.value = false
+    handleWebClear()
   }
 }
 
@@ -563,6 +618,7 @@ const handleSelect = () => {
     return
   }
   if (type === 'web') {
+    showFormPage.value = false
     showWebAnalysis.value = true
     dialog_analysisInfo.value = true
     return
@@ -583,6 +639,7 @@ const defaultValue = radioList.value[0].value // 默认选中
 const radioValue = ref(defaultValue)
 // 新增解析
 const handleAdd = () => {
+  handleWebClear()
   cardUploadRow.value = null
   cardImgUrl.value = null
   analysisLoading.value = false
@@ -603,8 +660,8 @@ onMounted(() => {
 .analysisInfoBox {
   display: flex;
   .analysisFile {
-    width: 50%;
-    max-height: 70vh;
+    // width: 50%;
+    // max-height: 70vh;
     padding-right: 12px;
     overflow: auto;
   }
@@ -619,4 +676,15 @@ onMounted(() => {
     background-color: aliceblue;
   }
 }
+.tagBox {
+  padding: 12px;
+  border: 1px dashed #409EFF;
+  border-radius: 4px;
+}
+:deep {
+  .el-tag__content {
+    display: flex;
+    align-items: center;
+  }
+}
 </style>