Browse Source

更新网页解析

Xiao_123 3 tháng trước cách đây
mục cha
commit
e7288b026e

+ 135 - 77
src/views/menduner/system/talentMap/maintenance/gather/components/webAnalysis.vue

@@ -5,57 +5,39 @@
 			:model="queryParams"
 			ref="queryFormRef"
 			:inline="true"
-			label-width="90px"
+			label-position="top"
+			label-width="110px"
 		>
-			<el-form-item label="url抓取数据" prop="urls">
+			<el-form-item label="微信公众号链接" prop="urls" class="!w-100%">
 				<el-input
 					v-model="queryParams.urls"
-					class="!w-420px"
+					class="!w-80%"
 					type="textarea"
-					:rows="1"
-					placeholder="请输入需要爬取的页面,多个页面请用 ',' 隔开"
+					:rows="2"
+					placeholder="请输入需要解析的页面"
 				/>
-			</el-form-item>
-			<el-form-item>
-				<el-button type="primary" plain :loading="loading" @click="handleExecute">执行</el-button>
+				<el-button type="primary" class="ml-10px" plain :loading="loading" @click="handleAnalysis">解析</el-button>
 			</el-form-item>
 		</el-form>
 	</ContentWrap>
 
 	<ContentWrap v-if="contents.length">
 		<el-row gutter="20">
-			<el-col v-for="(content, index) in contents" :key="index" :span="12">
+			<el-col v-for="(content, index) in contents" :key="index" :span="24">
 				<el-card class="!h-500px" v-loading="!content.data">
 					<template #header>
 						<div class="flex items-center justify-between">
 							<el-text class="flex-1" truncated>{{ content.url }}</el-text>
-							<div class="!w-85px">
+							<!-- <div class="!w-40px">
 								<Icon icon="ep:view" size="25" class="ml-10px cursor-pointer" color="#409eff" @click="showPage(content)" />
-								<Icon icon="ep:refresh" size="25" class=" ml-18px cursor-pointer" color="#409eff" @click="handleReload(content)" />
-							</div>
+							</div> -->
 						</div>
 					</template>
-					<div v-if="content.data">
-						<template v-if="typeof content.data === 'string'">{{ content.data }}</template>
-            <el-tabs v-else v-model="content.tab">
-              <el-tab-pane v-for="(v, k) in content.data.data[0]" :key="k" :label="k" :name="k" class="overflow-y-auto !h-360px">
-								<template v-if="k === 'html'">
-									<div class="position-sticky float-right">
-										<el-button
-											type="primary"
-											class="cursor-pointer"
-											@click="content.showHtml = !content.showHtml"
-											:icon="SetUp"
-											circle
-										/>
-									</div>
-                  <pre v-if="!content.showHtml">{{ v }}</pre>
-                  <div v-else v-html="v"></div>
-                </template>
-                <pre v-else>{{ v || '暂无数据' }}</pre>
-							</el-tab-pane>
-            </el-tabs>
+          <div class="overflow-y-auto !h-360px" v-if="content.data">
+            <pre>{{ content.data }}</pre>
           </div>
+					<el-button type="primary" plain class="mt-10px" @click="showPage(content)">预览</el-button>
+					<el-button type="primary" plain class="mt-10px" @click="handleConvert(content)">转换为markdown格式</el-button>
 				</el-card>
 			</el-col>
 		</el-row>
@@ -67,82 +49,158 @@
 		:with-header="false"
 		:modal="true"
 	>
-		<iframe class="!w-100% !h-[calc(100vh-90px)]" :src="drawerUrl" frameborder="0"></iframe>
+		<iframe id="iFrame" class="!w-100% !h-[calc(100vh-90px)]" src="" frameborder="0"></iframe>
+		<el-divider class="!ma-0" />
+		<div class="position-sticky left-20px !h-50px lh-50px">
+			<el-button type="primary" class="!w-100px" @click="drawer = false">关 闭</el-button>
+		</div>
+	</el-drawer>
+
+	<el-drawer
+		v-model="showMarkDown"
+		class="!w-50vw"
+		:with-header="false"
+		:modal="true"
+	>
+		<pre>{{ markdownContent }}</pre>
 		<el-divider class="!ma-0" />
 		<div class="position-sticky left-20px !h-50px lh-50px">
-			<el-button type="primary" class="!w-100px" @click="drawer = false; drawerUrl = ''">关 闭</el-button>
+			<el-button type="primary" class="!w-100px" @click="showMarkDown = false; markdownContent = ''">关 闭</el-button>
 		</div>
 	</el-drawer>
 </template>
 
 <script setup>
 /** 人才采集 网页解析 */
-import FirecrawlApp from '@mendable/firecrawl-js'
-import { SetUp } from '@element-plus/icons-vue'
+defineOptions({ name: 'WebPageParsing' })
+import axios from 'axios'
+import TurndownService from 'turndown'
 
 const message = useMessage() // message popup helper (used for warning/error toasts in this file)
 const { t } = useI18n() // internationalization helper
 
 const loading = ref(false) // bound to the parse button's :loading state
 const queryParams = reactive({
-	urls: 'https://mp.weixin.qq.com/s/WeCRR3zN3fPvlGR4t8YFDA'
+	urls: 'https://mp.weixin.qq.com/s/vQLWlSB6DzqSewtBLkk_kQ' // initial value of the URL textarea (v-model="queryParams.urls")
 })
 const queryFormRef = ref() // template ref of the query form
 const contents = ref([]) // parse results rendered by the v-for card list; each entry is read as { url, data }
 const drawer = ref(false) // set to true by showPage and false by the preview drawer's close button
-const drawerUrl = ref('')
 
-const showPage = (content) => {
-	drawer.value = true
-	drawerUrl.value = content.url
+// Shared Turndown instance used to convert article HTML into Markdown:
+// ATX ("#") headings, fenced code blocks and "-" as the bullet marker.
+const turndownService = new TurndownService({
+  headingStyle: 'atx',
+  codeBlockStyle: 'fenced',
+  bulletListMarker: '-'
+})
+
+// Custom <img> rule for WeChat official-account articles: the image URL is
+// read from `src`, falling back to `data-src` when `src` is missing or empty.
+turndownService.addRule('wechatImages', {
+  filter: 'img',
+  replacement: (content, node) => {
+    const alt = node.getAttribute('alt') || ''
+    const src = node.getAttribute('src') || node.getAttribute('data-src') || ''
+    // Without any source the output would be a broken "![alt]()" link, so the
+    // image is dropped instead.
+    if (!src) return ''
+    return `![${alt}](${src})`
+  }
+})
+
+// Extract the main article content from a full HTML page and convert it to Markdown.
+const wechatHtmlToMarkdown = (html) => {
+  // Parse the markup into a detached document so it can be queried and pruned
+  const parsed = new DOMParser().parseFromString(html, 'text/html')
+
+  // The article body is looked up by id="js_content"; fall back to the whole <body>
+  const root = parsed.querySelector('#js_content') ?? parsed.body
+
+  // Elements that must not end up in the Markdown output
+  const unwantedSelector = [
+    'script', 'style', 'iframe', 'button',
+    '.qr_code', '.rich_media_extra', '.copyright'
+  ].join(', ')
+
+  for (const node of root.querySelectorAll(unwantedSelector)) {
+    node.remove()
+  }
+
+  // Hand the cleaned markup to Turndown
+  return turndownService.turndown(root.innerHTML)
 }
 
-const handleReload = async (content) => {
-	content.data = null
-	const res = await handleData(queryParams.urls)
-	content.tab = 0
-	content.data = res
+// Markdown drawer state, plus the handler that converts one result entry and opens the drawer
+const showMarkDown = ref(false)
+const markdownContent = ref(null)
+const handleConvert = (res) => {
+	const { data } = res
+	// Nothing to convert until the entry has content
+	if (!data) return
+
+	const markdown = wechatHtmlToMarkdown(data)
+	if (!markdown) {
+		// An empty conversion result is reported to the user as a failure
+		return message.warning('转换失败')
+	}
+
+	markdownContent.value = markdown
+	showMarkDown.value = true
 }
 
-const handleData = async (url) => {
-	try {
-    const app = new FirecrawlApp({ apiKey: 'fc-85c1550c6db64ce4ae8f2d2cd2606e6f' })
-    const crawlResponse = await app.crawlUrl(url, {
-      limit: 100,
-      scrapeOptions: {
-        formats: ['markdown', 'html']
-      }
+// Preview the fetched page inside the drawer's iframe (element id "iFrame").
+// `res` is one entry of `contents`; nothing happens when `res.data` is empty.
+// `res.data` is used as an HTML string (assumed to be the raw article markup).
+const showPage = (res) => {
+  if (res.data) {
+    drawer.value = true
+    const html = res.data
+      .replace(/data-src/g, 'src') // turn every "data-src" into "src" so lazy-loaded images get a source
+      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/g, '') // strip <script> tags so untrusted scripts never run inside the iframe
+      .replace(/https/g, 'http') // rewrite every "https" to "http" — NOTE(review): on an HTTPS-hosted page this creates mixed content; confirm it is still wanted
+
+    // Inject a referrer policy so requests made from the iframe send no referrer,
+    // which gets around image hotlink protection. The tag goes right before the
+    // first </head>; when the markup has no </head> it is prepended instead of
+    // concatenating an "undefined" fragment, and any content after a second
+    // </head> is preserved.
+    const referrerMeta = '<meta name="referrer" content="never">'
+    const headEnd = html.indexOf('</head>')
+    const htmlWithMeta = headEnd === -1
+      ? referrerMeta + html
+      : html.slice(0, headEnd) + referrerMeta + html.slice(headEnd)
+
+    // Wait one tick so the drawer content (and its iframe) exists in the DOM
+    nextTick(() => {
+      const iframe = document.getElementById('iFrame')
+      if (!iframe) return
+      const doc = iframe.contentDocument || iframe.document
+      // The iframe document may be unavailable; bail out instead of throwing
+      if (!doc) return
+      doc.open()
+      doc.write(htmlWithMeta)
+      doc.close()
+      // After a short delay, force the article body (#js_content) to be visible;
+      // presumably the page's own scripts, stripped above, would normally reveal it.
+      // Increase the delay if the content is not ready in time.
+      setTimeout(() => {
+        const jsContent = doc.getElementById('js_content')
+        if (jsContent) {
+          jsContent.style.visibility = 'visible'
+          jsContent.style.opacity = 1
+        }
+      }, 100)
     })
-    if (!crawlResponse.success) {
-      throw new Error(`Failed to crawl: ${crawlResponse.error}`)
-    }
-    return crawlResponse
-  } catch (error) {
-    return error.message
   }
 }
 
-// 执行
-const handleExecute = async () => {
+// Parse: validate the entered links, send them to the Node service and show the returned contents.
+// Links are comma-separated; every one must be a WeChat official-account article URL.
+const handleAnalysis = async () => {
 	if (!queryParams.urls) return
 	contents.value = []
-	const urls = queryParams.urls.split(',').map(url => url.trim()).filter(url => url)
-	if (urls.length === 0) return
 
-	urls.forEach(url => {
-	  contents.value.push({ url, tab: 'markdown', showHtml: false, data: null })
-	})
+	const urlArr = queryParams.urls.split(',').map(url => url.trim()).filter(Boolean)
 
-	const crawlPromises = urls.map(async (url, index) => {
-		const res = await handleData(url)
-		contents.value[index] = { ...contents.value[index], data: res }
+	// `every` is vacuously true for an empty list (e.g. the input was just ","), so
+	// require at least one link. `startsWith` with a trailing slash rejects URLs that
+	// merely contain the WeChat origin somewhere in their path or query string.
+	const isWeChatUrl = urlArr.length > 0 && urlArr.every(url => url.startsWith('https://mp.weixin.qq.com/'))
+	if (!isWeChatUrl) {
+		message.warning('请输入微信公众文章链接')
+		return
+	}
+
+	const base_url = import.meta.env.VITE_NODE_BASE_URL
+
+	// Enter the loading state only once a request is actually sent, so the early
+	// return above can never leave the button spinning forever.
+	loading.value = true
+	axios.post(`${base_url}/process-urls`, { urlArr }, { timeout: 60000 }).then(res => {
+		console.log(res, '解析内容')
+		contents.value = res?.data?.contents ?? []
+	}).catch(err => {
+		console.log(err, '解析失败');
+		message.error(err.message)
+	}).finally(() => {
+		loading.value = false
 	})
+}
+</script>
 
-	try {
-		await Promise.all(crawlPromises)
-		console.log('All crawls completed:', contents.value); // 可在此处添加成功回调
-	} catch (error) {
-		console.error('爬取过程中发生错误:', error);
-	}
+<style scoped>
+.iframe-container {
+  position: relative;
+  width: 100%;
+  height: calc(100vh - 90px);
 }
-</script>
+</style>