Ver Fonte

网页解析:使用代理获取html内容

Xiao_123 há 1 dia atrás
pai
commit
d233ac6e0c

+ 168 - 0
src/views/menduner/system/talentMap/maintenance/gather/webPageParsing/index copy.vue

@@ -0,0 +1,168 @@
+<template>
+	<!-- Query form: a textarea holding the page URLs plus the button that starts the crawl. -->
+	<ContentWrap>
+		<el-form
+			ref="queryFormRef"
+			:model="queryParams"
+			class="-mb-15px"
+			label-width="90px"
+			inline
+		>
+			<!-- Several URLs may be entered at once, separated by commas. -->
+			<el-form-item prop="urls" label="url抓取数据">
+				<el-input
+					v-model="queryParams.urls"
+					type="textarea"
+					:rows="1"
+					class="!w-60vw"
+					placeholder="请输入需要爬取的页面,多个页面请用 ',' 隔开"
+				/>
+			</el-form-item>
+			<el-form-item>
+				<!-- The button shows a spinner while `loading` is true. -->
+				<el-button plain type="primary" :loading="loading" @click="handleExecute">执行</el-button>
+			</el-form-item>
+		</el-form>
+	</ContentWrap>
+
+	<!-- Result cards: one per requested URL; a card shows its loading mask until `content.data` is set. -->
+	<ContentWrap v-if="contents.length">
+		<!-- `gutter` is bound with `:` so el-row receives the number 20 instead of the string "20". -->
+		<el-row :gutter="20">
+			<el-col v-for="(content, index) in contents" :key="index" :span="12">
+				<el-card class="!h-500px" v-loading="!content.data">
+					<template #header>
+						<div class="flex items-center justify-between">
+							<el-text class="flex-1" truncated>{{ content.url }}</el-text>
+							<div class="!w-85px">
+								<Icon icon="ep:view" size="25" class="ml-10px cursor-pointer" color="#409eff" @click="showPage(content)" />
+								<Icon icon="ep:refresh" size="25" class=" ml-18px cursor-pointer" color="#409eff" @click="handleReload(content)" />
+							</div>
+						</div>
+					</template>
+					<div v-if="content.data">
+						<!-- A string payload is the error message returned by handleData; anything else is a crawl response. -->
+						<template v-if="typeof content.data === 'string'">{{ content.data }}</template>
+						<!-- One tab per key of the first crawled page; the tab name is that key. -->
+						<el-tabs v-else v-model="content.tab">
+							<el-tab-pane v-for="(v, k) in content.data.data[0]" :key="k" :label="k" :name="k" class="overflow-y-auto !h-360px">
+								<template v-if="k === 'html'">
+									<!-- Toggles between the raw HTML source and the rendered HTML. -->
+									<div class="position-sticky float-right">
+										<el-button
+											type="primary"
+											class="cursor-pointer"
+											@click="content.showHtml = !content.showHtml"
+											:icon="SetUp"
+											circle
+										/>
+									</div>
+									<!-- NOTE(security): the v-html below injects crawled, untrusted markup into this page; sanitize it or render it in a sandboxed iframe before exposing this view to arbitrary URLs. -->
+									<pre v-if="!content.showHtml">{{ v }}</pre>
+									<div v-else v-html="v"></div>
+								</template>
+								<pre v-else>{{ v || '暂无数据' }}</pre>
+							</el-tab-pane>
+						</el-tabs>
+					</div>
+				</el-card>
+			</el-col>
+		</el-row>
+	</ContentWrap>
+
+	<!-- Preview drawer: loads the original page in an iframe, with a close button underneath. -->
+	<el-drawer
+		v-model="drawer"
+		:modal="true"
+		:with-header="false"
+		class="!w-50vw"
+	>
+		<iframe :src="drawerUrl" frameborder="0" class="!w-100% !h-[calc(100vh-90px)]"></iframe>
+		<el-divider class="!ma-0" />
+		<div class="position-sticky left-20px !h-50px lh-50px">
+			<!-- Closing also clears the iframe URL. -->
+			<el-button class="!w-100px" type="primary" @click="drawer = false; drawerUrl = ''">关 闭</el-button>
+		</div>
+	</el-drawer>
+</template>
+
+<script setup>
+/** 人才采集 网页解析 */
+defineOptions({ name: 'WebPageParsing' })
+import FirecrawlApp from '@mendable/firecrawl-js'
+import { SetUp } from '@element-plus/icons-vue'
+
+// Shared helpers
+const { t } = useI18n() // i18n translate function
+const message = useMessage() // message / toast helper
+
+// Query form state
+const queryFormRef = ref() // template ref of the query <el-form>
+const queryParams = reactive({
+	// Alternative sample page: 'https://element.eleme.cn/#/zh-CN/component/installation'
+	urls: 'https://mp.weixin.qq.com/s/WeCRR3zN3fPvlGR4t8YFDA' // pre-filled sample; several URLs are comma-separated
+})
+const loading = ref(false) // bound to the execute button's `:loading`
+
+// Result cards, one entry per URL: { url, tab, showHtml, data }
+const contents = ref([])
+
+// Preview drawer state
+const drawerUrl = ref('') // page loaded in the drawer's iframe
+const drawer = ref(false) // whether the drawer is open
+
+/**
+ * Open the preview drawer and load the card's page in its iframe.
+ *
+ * @param content entry of `contents`; only its `url` is read
+ */
+const showPage = (content) => {
+	// Reveal the drawer, then hand the card's address to the iframe binding.
+	drawer.value = true
+	const { url } = content
+	drawerUrl.value = url
+}
+
+/**
+ * Re-crawl a single card and refresh its content in place.
+ *
+ * @param content entry of `contents`; its `data` and `tab` fields are overwritten
+ */
+const handleReload = async (content) => {
+	// Clearing `data` puts the card back under its loading mask (`v-loading="!content.data"`).
+	content.data = null
+	// Reload this card's own URL. The textarea may hold several comma-separated URLs
+	// (or may have been edited since), so it must not be used as the crawl target.
+	const res = await handleData(content.url)
+	// Tab panes are named after the result keys, so select the same default tab
+	// that handleExecute uses instead of a numeric index that matches no pane.
+	content.tab = 'markdown'
+	content.data = res
+}
+
+/**
+ * Crawl one URL with Firecrawl, requesting markdown and HTML output.
+ *
+ * Failures are not thrown: they are returned as a non-empty message string, which the
+ * template renders as plain text in the card.
+ *
+ * @param url page to crawl
+ * @returns the Firecrawl crawl response on success, otherwise an error message string
+ */
+const handleData = async (url) => {
+	try {
+		// NOTE(security): this API key is hard-coded in client-side code and therefore visible to
+		// anyone who can load the page. Rotate it and move the call behind a server-side endpoint.
+		const app = new FirecrawlApp({ apiKey: 'fc-85c1550c6db64ce4ae8f2d2cd2606e6f' })
+		// NOTE(review): the template only renders `data.data[0]`, so a 100-page limit may be
+		// more than this view needs - confirm.
+		const crawlResponse = await app.crawlUrl(url, {
+			limit: 100,
+			scrapeOptions: {
+				formats: ['markdown', 'html']
+			}
+		})
+		if (!crawlResponse.success) {
+			throw new Error(`Failed to crawl: ${crawlResponse.error}`)
+		}
+		return crawlResponse
+	} catch (error) {
+		// The thrown value may not be an Error, or may carry an empty message. A falsy return
+		// would leave the card under its loading mask forever, so always return some text.
+		const text = error instanceof Error ? error.message : String(error)
+		return text || 'Failed to crawl'
+	}
+}
+
+// const handleExecute = async () => {
+// 	if (!queryParams.urls) return
+// 	contents.value = []
+// 	const urls = queryParams.urls.split(',')
+
+// 	const run = async (url) => {
+// 	  contents.value.push({ url, tab: 'markdown', showHtml: false, data: null })
+// 		const res = await handleData(url)
+// 		contents.value[contents.value.length - 1] = { url, tab: 'markdown', showHtml: false, data: res }
+
+// 		if (contents.value.length < urls.length) {
+// 			await run(urls[contents.value.length])
+// 		}
+// 	}
+
+// 	await run(urls[contents.value.length])
+// }
+
+/**
+ * Crawl every URL listed in the textarea and show one result card per URL.
+ *
+ * Placeholder cards are created first (so each card can show its loading mask), then all
+ * URLs are crawled in parallel and every card is filled in as soon as its own crawl returns.
+ * `loading` is held for the whole run so the execute button shows progress.
+ */
+const handleExecute = async () => {
+	// Ignore clicks while a run is in flight: results are written back by index, so an
+	// overlapping run would write the older run's results into the newer run's list.
+	if (loading.value) return
+	if (!queryParams.urls) return
+	contents.value = []
+	const urls = queryParams.urls.split(',').map(url => url.trim()).filter(url => url)
+	if (urls.length === 0) return
+
+	loading.value = true
+	// One placeholder per URL; `data: null` keeps the card in its loading state.
+	contents.value = urls.map(url => ({ url, tab: 'markdown', showHtml: false, data: null }))
+
+	try {
+		await Promise.all(urls.map(async (url, index) => {
+			const res = await handleData(url)
+			contents.value[index] = { ...contents.value[index], data: res }
+		}))
+		console.log('All crawls completed:', contents.value); // a success callback could be added here
+	} catch (error) {
+		console.error('爬取过程中发生错误:', error);
+	} finally {
+		loading.value = false
+	}
+}
+</script>

+ 70 - 93
src/views/menduner/system/talentMap/maintenance/gather/webPageParsing/index.vue

@@ -7,13 +7,13 @@
 			:inline="true"
 			label-width="90px"
 		>
-			<el-form-item label="url抓取数据" prop="urls">
+			<el-form-item label="url提取内容" prop="urls">
 				<el-input
 					v-model="queryParams.urls"
 					class="!w-60vw"
 					type="textarea"
-					:rows="1"
-					placeholder="请输入需要取的页面,多个页面请用 ',' 隔开"
+					:rows="2"
+					placeholder="请输入需要取的页面,多个页面请用 ',' 隔开"
 				/>
 			</el-form-item>
 			<el-form-item>
@@ -29,32 +29,13 @@
 					<template #header>
 						<div class="flex items-center justify-between">
 							<el-text class="flex-1" truncated>{{ content.url }}</el-text>
-							<div class="!w-85px">
+							<div class="!w-40px">
 								<Icon icon="ep:view" size="25" class="ml-10px cursor-pointer" color="#409eff" @click="showPage(content)" />
-								<Icon icon="ep:refresh" size="25" class=" ml-18px cursor-pointer" color="#409eff" @click="handleReload(content)" />
 							</div>
 						</div>
 					</template>
-					<div v-if="content.data">
-						<template v-if="typeof content.data === 'string'">{{ content.data }}</template>
-            <el-tabs v-else v-model="content.tab">
-              <el-tab-pane v-for="(v, k) in content.data.data[0]" :key="k" :label="k" :name="k" class="overflow-y-auto !h-360px">
-								<template v-if="k === 'html'">
-									<div class="position-sticky float-right">
-										<el-button
-											type="primary"
-											class="cursor-pointer"
-											@click="content.showHtml = !content.showHtml"
-											:icon="SetUp"
-											circle
-										/>
-									</div>
-                  <pre v-if="!content.showHtml">{{ v }}</pre>
-                  <div v-else v-html="v"></div>
-                </template>
-                <pre v-else>{{ v || '暂无数据' }}</pre>
-							</el-tab-pane>
-            </el-tabs>
+          <div class="overflow-y-auto !h-360px" v-if="content.data">
+            <pre>{{ content.data }}</pre>
           </div>
 				</el-card>
 			</el-col>
@@ -67,10 +48,10 @@
 		:with-header="false"
 		:modal="true"
 	>
-		<iframe class="!w-100% !h-[calc(100vh-90px)]" :src="drawerUrl" frameborder="0"></iframe>
+		<iframe id="iFrame" class="!w-100% !h-[calc(100vh-90px)]" src="" frameborder="0"></iframe>
 		<el-divider class="!ma-0" />
 		<div class="position-sticky left-20px !h-50px lh-50px">
-			<el-button type="primary" class="!w-100px" @click="drawer = false; drawerUrl = ''">关 闭</el-button>
+			<el-button type="primary" class="!w-100px" @click="drawer = false">关 闭</el-button>
 		</div>
 	</el-drawer>
 </template>
@@ -78,91 +59,87 @@
 <script setup>
 /** 人才采集 网页解析 */
 defineOptions({ name: 'WebPageParsing' })
-import FirecrawlApp from '@mendable/firecrawl-js'
-import { SetUp } from '@element-plus/icons-vue'
+import axios from 'axios'
 
 const message = useMessage() // 消息弹窗
 const { t } = useI18n() // 国际化
 
 const loading = ref(false)
 const queryParams = reactive({
-	// urls: 'https://element.eleme.cn/#/zh-CN/component/installation'
-	urls: 'https://mp.weixin.qq.com/s/WeCRR3zN3fPvlGR4t8YFDA'
+	urls: 'https://mp.weixin.qq.com/s/vQLWlSB6DzqSewtBLkk_kQ'
 })
 const queryFormRef = ref()
 const contents = ref([])
 const drawer = ref(false)
-const drawerUrl = ref('')
 
-const showPage = (content) => {
-	drawer.value = true
-	drawerUrl.value = content.url
-}
-
-const handleReload = async (content) => {
-	content.data = null
-	const res = await handleData(queryParams.urls)
-	content.tab = 0
-	content.data = res
-}
-
-const handleData = async (url) => {
-	try {
-    const app = new FirecrawlApp({ apiKey: 'fc-85c1550c6db64ce4ae8f2d2cd2606e6f' })
-    const crawlResponse = await app.crawlUrl(url, {
-      limit: 100,
-      scrapeOptions: {
-        formats: ['markdown', 'html']
-      }
+/**
+ * Preview a fetched article in the drawer by writing its HTML into the `#iFrame` element.
+ *
+ * NOTE(security): only <script> elements are stripped below. Inline event handlers
+ * (e.g. `onerror=`) survive, and the iframe is same-origin and not sandboxed - review
+ * before using this with untrusted pages.
+ *
+ * @param res entry of `contents`; `res.data` is expected to be the article's HTML string
+ */
+const showPage = (res) => {
+  // Only string payloads can be rewritten and written into the iframe; anything else is skipped.
+  if (typeof res.data === 'string' && res.data) {
+    drawer.value = true
+    const html = res.data
+      .replace(/data-src/g, 'src') // turn data-src attributes into src
+      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '') // drop <script> elements, in any letter case
+      .replace(/https/g, 'http') // original author's intent: avoid browser warnings about non-HTTPS resources
+    // Ask the browser not to send a referrer from inside the iframe (works around image hotlink protection).
+    const referrerMeta = '<meta name="referrer" content="never">'
+    // Inject the tag before the first </head>. When the document has no </head>, prepend it
+    // instead of concatenating an undefined tail; content after a later </head> is kept.
+    const pageHtml = html.includes('</head>')
+      ? html.replace('</head>', `${referrerMeta}</head>`)
+      : referrerMeta + html
+
+    nextTick(() => {
+      const iframe = document.getElementById('iFrame')
+      if (!iframe) return
+      const doc = iframe.contentDocument || iframe.document
+      if (!doc) return
+      doc.open()
+      doc.write(pageHtml)
+      doc.close()
+      // After a short delay, force #js_content to be visible (presumably the article hides it
+      // until its own, now removed, scripts run - confirm). Increase the delay if needed.
+      setTimeout(() => {
+        const jsContent = doc.getElementById('js_content')
+        if (jsContent) {
+          jsContent.style.visibility = 'visible'
+          jsContent.style.opacity = 1
+        }
+      }, 100)
     })
-    if (!crawlResponse.success) {
-      throw new Error(`Failed to crawl: ${crawlResponse.error}`)
-    }
-    return crawlResponse
-  } catch (error) {
-    return error.message
   }
 }
 
-// const handleExecute = async () => {
-// 	if (!queryParams.urls) return
-// 	contents.value = []
-// 	const urls = queryParams.urls.split(',')
-
-// 	const run = async (url) => {
-// 	  contents.value.push({ url, tab: 'markdown', showHtml: false, data: null })
-// 		const res = await handleData(url)
-// 		contents.value[contents.value.length - 1] = { url, tab: 'markdown', showHtml: false, data: res }
-
-// 		if (contents.value.length < urls.length) {
-// 			await run(urls[contents.value.length])
-// 		}
-// 	}
-
-// 	await run(urls[contents.value.length])
-// }
-
 // 执行
 const handleExecute = async () => {
 	if (!queryParams.urls) return
 	contents.value = []
-	const urls = queryParams.urls.split(',').map(url => url.trim()).filter(url => url)
-	if (urls.length === 0) return
-
-	urls.forEach(url => {
-	  contents.value.push({ url, tab: 'markdown', showHtml: false, data: null })
-	})
-
-	const crawlPromises = urls.map(async (url, index) => {
-		const res = await handleData(url)
-		contents.value[index] = { ...contents.value[index], data: res }
-	})
-
+	loading.value = true
 	try {
-		await Promise.all(crawlPromises)
-		console.log('All crawls completed:', contents.value); // 可在此处添加成功回调
-	} catch (error) {
-		console.error('爬取过程中发生错误:', error);
+		const urlArr = queryParams.urls.split(',').map(url => url.trim()).filter(Boolean)
+		const isWeChatUrl = urlArr.every(url => url.includes('https://mp.weixin.qq.com'))
+		if (!isWeChatUrl) {
+			message.warning('请输入微信公众文章链接')
+			return
+		}
+
+		const requests = urlArr.map(url => {
+			const path = url.split('mp.weixin.qq.com')[1]
+			const realurl = `/weixin-article${path}`
+			return axios.get(realurl).then(res => ({
+				url: url,
+				data: res.data
+			}))
+		})
+
+		const results = await Promise.all(requests)
+		contents.value = results.filter(item => item.data)
+	} catch (e) {
+		console.error(e)
+		message.error('获取失败')
+	} finally {
+		loading.value = false
 	}
 }
-</script>
+</script>
+
+<style scoped>
+.iframe-container {
+  position: relative;
+  width: 100%;
+  height: calc(100vh - 90px);
+}
+</style>

+ 6 - 0
vite.config.ts

@@ -45,6 +45,12 @@ export default ({ command, mode }: ConfigEnv): UserConfig => {
           secure: false, // 是否支持 https,默认 false
           changeOrigin: true, // 是否支持跨域
           rewrite: (path) => path.replace(new RegExp(`^/api`), '/api')
+        },
+        '/weixin-article': {
+          target: 'https://mp.weixin.qq.com/',
+          secure: true,
+          changeOrigin: true,
+          rewrite: (path) => path.replace(new RegExp(`^/weixin-article`), '')
         }
       }
     },