6 tháng trước cách đây · e7288b026e
--- a/src/views/menduner/system/talentMap/maintenance/gather/components/webAnalysis.vue
+++ b/src/views/menduner/system/talentMap/maintenance/gather/components/webAnalysis.vue
@@ -5,57 +5,39 @@
 
				 			:model="queryParams"
			
 
				 			ref="queryFormRef"
			
 
				 			:inline="true"
			
 
				-			label-width="90px"
			
 
				+			label-position="top"
			
 
				+			label-width="110px"
			
 
				 		>
			
 
				-			<el-form-item label="url抓取数据" prop="urls">
			
 
				+			<el-form-item label="微信公众号链接" prop="urls" class="!w-100%">
			
 
				 				<el-input
			
 
				 					v-model="queryParams.urls"
			
 
				-					class="!w-420px"
			
 
				+					class="!w-80%"
			
 
				 					type="textarea"
			
 
				-					:rows="1"
			
 
				-					placeholder="请输入需要爬取的页面，多个页面请用 ',' 隔开"
			
 
				+					:rows="2"
			
 
				+					placeholder="请输入需要解析的页面"
			
 
				 				/>
			
 
				-			</el-form-item>
			
 
				-			<el-form-item>
			
 
				-				<el-button type="primary" plain :loading="loading" @click="handleExecute">执行</el-button>
			
 
				+				<el-button type="primary" class="ml-10px" plain :loading="loading" @click="handleAnalysis">解析</el-button>
			
 
				 			</el-form-item>
			
 
				 		</el-form>
			
 
				 	</ContentWrap>
			
 
				 
			
 
				 	<ContentWrap v-if="contents.length">
			
 
				 		<el-row gutter="20">
			
 
				-			<el-col v-for="(content, index) in contents" :key="index" :span="12">
			
 
				+			<el-col v-for="(content, index) in contents" :key="index" :span="24">
			
 
				 				<el-card class="!h-500px" v-loading="!content.data">
			
 
				 					<template #header>
			
 
				 						<div class="flex items-center justify-between">
			
 
				 							<el-text class="flex-1" truncated>{{ content.url }}</el-text>
			
 
				-							<div class="!w-85px">
			
 
				+							<!-- <div class="!w-40px">
			
 
				 								<Icon icon="ep:view" size="25" class="ml-10px cursor-pointer" color="#409eff" @click="showPage(content)" />
			
 
				-								<Icon icon="ep:refresh" size="25" class=" ml-18px cursor-pointer" color="#409eff" @click="handleReload(content)" />
			
 
				-							</div>
			
 
				+							</div> -->
			
 
				 						</div>
			
 
				 					</template>
			
 
				-					<div v-if="content.data">
			
 
				-						<template v-if="typeof content.data === 'string'">{{ content.data }}</template>
			
 
				-            <el-tabs v-else v-model="content.tab">
			
 
				-              <el-tab-pane v-for="(v, k) in content.data.data[0]" :key="k" :label="k" :name="k" class="overflow-y-auto !h-360px">
			
 
				-								<template v-if="k === 'html'">
			
 
				-									<div class="position-sticky float-right">
			
 
				-										<el-button
			
 
				-											type="primary"
			
 
				-											class="cursor-pointer"
			
 
				-											@click="content.showHtml = !content.showHtml"
			
 
				-											:icon="SetUp"
			
 
				-											circle
			
 
				-										/>
			
 
				-									</div>
			
 
				-                  <pre v-if="!content.showHtml">{{ v }}</pre>
			
 
				-                  <div v-else v-html="v"></div>
			
 
				-                </template>
			
 
				-                <pre v-else>{{ v || '暂无数据' }}</pre>
			
 
				-							</el-tab-pane>
			
 
				-            </el-tabs>
			
 
				+          <div class="overflow-y-auto !h-360px" v-if="content.data">
			
 
				+            <pre>{{ content.data }}</pre>
			
 
				           </div>
			
 
				+					<el-button type="primary" plain class="mt-10px" @click="showPage(content)">预览</el-button>
			
 
				+					<el-button type="primary" plain class="mt-10px" @click="handleConvert(content)">转换为markdown格式</el-button>
			
 
				 				</el-card>
			
 
				 			</el-col>
			
 
				 		</el-row>
			
@@ -67,82 +49,158 @@
 
				 		:with-header="false"
			
 
				 		:modal="true"
			
 
				 	>
			
 
				-		<iframe class="!w-100% !h-[calc(100vh-90px)]" :src="drawerUrl" frameborder="0"></iframe>
			
 
				+		<iframe id="iFrame" class="!w-100% !h-[calc(100vh-90px)]" src="" frameborder="0"></iframe>
			
 
				+		<el-divider class="!ma-0" />
			
 
				+		<div class="position-sticky left-20px !h-50px lh-50px">
			
 
				+			<el-button type="primary" class="!w-100px" @click="drawer = false">关 闭</el-button>
			
 
				+		</div>
			
 
				+	</el-drawer>
			
 
				+
			
 
				+	<el-drawer
			
 
				+		v-model="showMarkDown"
			
 
				+		class="!w-50vw"
			
 
				+		:with-header="false"
			
 
				+		:modal="true"
			
 
				+	>
			
 
				+		<pre>{{ markdownContent }}</pre>
			
 
				 		<el-divider class="!ma-0" />
			
 
				 		<div class="position-sticky left-20px !h-50px lh-50px">
			
 
				-			<el-button type="primary" class="!w-100px" @click="drawer = false; drawerUrl = ''">关 闭</el-button>
			
 
				+			<el-button type="primary" class="!w-100px" @click="showMarkDown = false; markdownContent = ''">关 闭</el-button>
			
 
				 		</div>
			
 
				 	</el-drawer>
			
 
				 </template>
			
 
				 
			
 
				 <script setup>
			
 
				 /** 人才采集 网页解析 */
			
 
				-import FirecrawlApp from '@mendable/firecrawl-js'
			
 
				-import { SetUp } from '@element-plus/icons-vue'
			
 
				+defineOptions({ name: 'WebPageParsing' })
			
 
				+import axios from 'axios'
			
 
				+import TurndownService from 'turndown'
			
 
				 
			
 
				 const message = useMessage() // 消息弹窗
			
 
				 const { t } = useI18n() // 国际化
			
 
				 
			
 
				 const loading = ref(false)
			
 
				 const queryParams = reactive({
			
 
				-	urls: 'https://mp.weixin.qq.com/s/WeCRR3zN3fPvlGR4t8YFDA'
			
 
				+	urls: 'https://mp.weixin.qq.com/s/vQLWlSB6DzqSewtBLkk_kQ'
			
 
				 })
			
 
				 const queryFormRef = ref()
			
 
				 const contents = ref([])
			
 
				 const drawer = ref(false)
			
 
				-const drawerUrl = ref('')
			
 
				 
			
 
				-const showPage = (content) => {
			
 
				-	drawer.value = true
			
 
				-	drawerUrl.value = content.url
			
 
				+// Create the shared HTML-to-Markdown converter (Turndown), configured for ATX headings, fenced code blocks and '-' bullet markers
			
 
				+const turndownService = new TurndownService({
			
 
				+  headingStyle: 'atx',
			
 
				+  codeBlockStyle: 'fenced',
			
 
				+  bulletListMarker: '-'
			
 
				+})
			
 
				+
			
 
				+// Custom rule for <img> nodes: emit ![alt](src), reading the URL from src and falling back to data-src when src is missing or empty (NOTE(review): presumably because WeChat articles lazy-load images through data-src - confirm)
			
 
				+turndownService.addRule('wechatImages', {
			
 
				+  filter: 'img',
			
 
				+  replacement: (content, node) => {
			
 
				+    const alt = node.getAttribute('alt') || ''
			
 
				+    const src = node.getAttribute('src') || node.getAttribute('data-src') || ''
			
 
				+    return `![${alt}](${src})`
			
 
				+  }
			
 
				+})
			
 
				+
			
 
				+// Extract the main article content from an HTML string and convert it to Markdown; `html` is the page HTML as a string, and the return value is whatever turndownService.turndown produces for the cleaned content
			
 
				+const wechatHtmlToMarkdown = (html) => {
			
 
				+  // Parse the HTML string into a separate document with DOMParser so it can be queried and pruned
			
 
				+  const parser = new DOMParser()
			
 
				+  const doc = parser.parseFromString(html, 'text/html')
			
 
				+  
			
 
				+  // Pick the article body - WeChat official-account articles usually keep it in the element with id="js_content"; fall back to the whole <body> otherwise
			
 
				+  const content = doc.querySelector('#js_content') || doc.body
			
 
				+  
			
 
				+  // Selectors of elements to strip from the content before conversion
			
 
				+  const elementsToRemove = [
			
 
				+    'script', 'style', 'iframe', 'button', 
			
 
				+    '.qr_code', '.rich_media_extra', '.copyright'
			
 
				+  ]
			
 
				+  
			
 
				+  elementsToRemove.forEach(selector => {
			
 
				+    content.querySelectorAll(selector).forEach(el => el.remove())
			
 
				+  })
			
 
				+  
			
 
				+  // Convert the remaining markup to Markdown
			
 
				+  return turndownService.turndown(content.innerHTML)
			
 
				 }
			
 
				 
			
 
				-const handleReload = async (content) => {
			
 
				-	content.data = null
			
 
				-	const res = await handleData(queryParams.urls)
			
 
				-	content.tab = 0
			
 
				-	content.data = res
			
 
				+// Convert a parsed result to Markdown and show it in the Markdown drawer: showMarkDown toggles the drawer, markdownContent holds the text; handleConvert(res) does nothing when res.data is empty and shows a warning when the conversion yields an empty result
			
 
				+const showMarkDown = ref(false)
			
 
				+const markdownContent = ref(null)
			
 
				+const handleConvert = (res) => {
			
 
				+	if (!res.data) return
			
 
				+	const result = wechatHtmlToMarkdown(res.data)
			
 
				+	if (!result) return message.warning('转换失败')
			
 
				+	markdownContent.value = result
			
 
				+	showMarkDown.value = true
			
 
				 }
			
 
				 
			
 
				-const handleData = async (url) => {
			
 
				-	try {
			
 
				-    const app = new FirecrawlApp({ apiKey: 'fc-85c1550c6db64ce4ae8f2d2cd2606e6f' })
			
 
				-    const crawlResponse = await app.crawlUrl(url, {
			
 
				-      limit: 100,
			
 
				-      scrapeOptions: {
			
 
				-        formats: ['markdown', 'html']
			
 
				-      }
			
 
				+// 查看原网页
			
 
				+const showPage = (res) => {
			
 
				+  if (res.data) {
			
 
				+    drawer.value = true
			
 
				+    let html = res.data
			
 
				+    html = html.replace(/data-src/g, 'src') // 将 data-src 转化为 src
			
 
				+      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/g, '') // 移除HTML内容中所有的<script>标签，这样可以避免在iframe中执行潜在的不受信任的脚本。
			
 
				+      .replace(/https/g, 'http') // 将HTML内容中所有的https替换为http，可能是为了避免在HTTPS环境下加载非HTTPS资源导致浏览器警告
			
 
				+    
			
 
				+    nextTick(() => {
			
 
				+      const iframe = document.getElementById('iFrame')
			
 
				+      if (!iframe) return
			
 
				+      const doc = iframe.contentDocument || iframe.document
			
 
				+      // 设置 iframe 中请求不发送 referrer，以绕过图片防盗链
			
 
				+      const htmlArr = html.split('</head>')
			
 
				+      const html_src_add = htmlArr[0] + '<meta name="referrer" content="never"></head>' + htmlArr[1]
			
 
				+      doc.open()
			
 
				+      doc.write(html_src_add)
			
 
				+      doc.close()
			
 
				+      // 通过延时获取文档高度赋值Iframe去除滚动条，根据实际情况增加延时时间
			
 
				+      setTimeout(() => {
			
 
				+        const jsContent = doc.getElementById('js_content')
			
 
				+        if (jsContent) {
			
 
				+          jsContent.style.visibility = 'visible'
			
 
				+          jsContent.style.opacity = 1
			
 
				+        }
			
 
				+      }, 100)
			
 
				     })
			
 
				-    if (!crawlResponse.success) {
			
 
				-      throw new Error(`Failed to crawl: ${crawlResponse.error}`)
			
 
				-    }
			
 
				-    return crawlResponse
			
 
				-  } catch (error) {
			
 
				-    return error.message
			
 
				   }
			
 
				 }
			
 
				 
			
 
				-// 执行
			
 
				-const handleExecute = async () => {
			
 
				+// 解析
			
 
				+const handleAnalysis = async () => {
			
 
				 	if (!queryParams.urls) return
			
 
				 	contents.value = []
			
 
				-	const urls = queryParams.urls.split(',').map(url => url.trim()).filter(url => url)
			
 
				-	if (urls.length === 0) return
			
 
				+	loading.value = true
			
 
				 
			
 
				-	urls.forEach(url => {
			
 
				-	  contents.value.push({ url, tab: 'markdown', showHtml: false, data: null })
			
 
				-	})
			
 
				+	const urlArr = queryParams.urls.split(',').map(url => url.trim()).filter(Boolean)
			
 
				 
			
 
				-	const crawlPromises = urls.map(async (url, index) => {
			
 
				-		const res = await handleData(url)
			
 
				-		contents.value[index] = { ...contents.value[index], data: res }
			
 
				+	const isWeChatUrl = urlArr.every(url => url.includes('https://mp.weixin.qq.com'))
			
 
				+	if (!isWeChatUrl) {
			
 
				+		message.warning('请输入微信公众文章链接')
			
 
				+		return
			
 
				+	}
			
 
				+
			
 
				+	const base_url = import.meta.env.VITE_NODE_BASE_URL
			
 
				+
			
 
				+	axios.post(`${base_url}/process-urls`, { urlArr }, { timeout: 60000 }).then(res => {
			
 
				+		console.log(res, '解析内容')
			
 
				+		contents.value = res?.data?.contents ?? []
			
 
				+	}).catch(err => {
			
 
				+		console.log(err, '解析失败');
			
 
				+		message.error(err.message)
			
 
				+	}).finally(_ => {
			
 
				+		loading.value = false
			
 
				 	})
			
 
				+}
			
 
				+</script>
			
 
				 
			
 
				-	try {
			
 
				-		await Promise.all(crawlPromises)
			
 
				-		console.log('All crawls completed:', contents.value); // 可在此处添加成功回调
			
 
				-	} catch (error) {
			
 
				-		console.error('爬取过程中发生错误:', error);
			
 
				-	}
			
 
				+<style scoped>
			
 
				+.iframe-container {
			
 
				+  position: relative;
			
 
				+  width: 100%;
			
 
				+  height: calc(100vh - 90px);
			
 
				 }
			
 
				-</script>
			
 
				+</style>