|
@@ -7,13 +7,13 @@
|
|
:inline="true"
|
|
:inline="true"
|
|
label-width="90px"
|
|
label-width="90px"
|
|
>
|
|
>
|
|
- <el-form-item label="url抓取数据" prop="urls">
|
|
|
|
|
|
+ <el-form-item label="url提取内容" prop="urls">
|
|
<el-input
|
|
<el-input
|
|
v-model="queryParams.urls"
|
|
v-model="queryParams.urls"
|
|
class="!w-60vw"
|
|
class="!w-60vw"
|
|
type="textarea"
|
|
type="textarea"
|
|
- :rows="1"
|
|
|
|
- placeholder="请输入需要爬取的页面,多个页面请用 ',' 隔开"
|
|
|
|
|
|
+ :rows="2"
|
|
|
|
+ placeholder="请输入需要提取的页面,多个页面请用 ',' 隔开"
|
|
/>
|
|
/>
|
|
</el-form-item>
|
|
</el-form-item>
|
|
<el-form-item>
|
|
<el-form-item>
|
|
@@ -29,32 +29,13 @@
|
|
<template #header>
|
|
<template #header>
|
|
<div class="flex items-center justify-between">
|
|
<div class="flex items-center justify-between">
|
|
<el-text class="flex-1" truncated>{{ content.url }}</el-text>
|
|
<el-text class="flex-1" truncated>{{ content.url }}</el-text>
|
|
- <div class="!w-85px">
|
|
|
|
|
|
+ <div class="!w-40px">
|
|
<Icon icon="ep:view" size="25" class="ml-10px cursor-pointer" color="#409eff" @click="showPage(content)" />
|
|
<Icon icon="ep:view" size="25" class="ml-10px cursor-pointer" color="#409eff" @click="showPage(content)" />
|
|
- <Icon icon="ep:refresh" size="25" class=" ml-18px cursor-pointer" color="#409eff" @click="handleReload(content)" />
|
|
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</template>
|
|
</template>
|
|
- <div v-if="content.data">
|
|
|
|
- <template v-if="typeof content.data === 'string'">{{ content.data }}</template>
|
|
|
|
- <el-tabs v-else v-model="content.tab">
|
|
|
|
- <el-tab-pane v-for="(v, k) in content.data.data[0]" :key="k" :label="k" :name="k" class="overflow-y-auto !h-360px">
|
|
|
|
- <template v-if="k === 'html'">
|
|
|
|
- <div class="position-sticky float-right">
|
|
|
|
- <el-button
|
|
|
|
- type="primary"
|
|
|
|
- class="cursor-pointer"
|
|
|
|
- @click="content.showHtml = !content.showHtml"
|
|
|
|
- :icon="SetUp"
|
|
|
|
- circle
|
|
|
|
- />
|
|
|
|
- </div>
|
|
|
|
- <pre v-if="!content.showHtml">{{ v }}</pre>
|
|
|
|
- <div v-else v-html="v"></div>
|
|
|
|
- </template>
|
|
|
|
- <pre v-else>{{ v || '暂无数据' }}</pre>
|
|
|
|
- </el-tab-pane>
|
|
|
|
- </el-tabs>
|
|
|
|
|
|
+ <div class="overflow-y-auto !h-360px" v-if="content.data">
|
|
|
|
+ <pre>{{ content.data }}</pre>
|
|
</div>
|
|
</div>
|
|
</el-card>
|
|
</el-card>
|
|
</el-col>
|
|
</el-col>
|
|
@@ -67,10 +48,10 @@
|
|
:with-header="false"
|
|
:with-header="false"
|
|
:modal="true"
|
|
:modal="true"
|
|
>
|
|
>
|
|
- <iframe class="!w-100% !h-[calc(100vh-90px)]" :src="drawerUrl" frameborder="0"></iframe>
|
|
|
|
|
|
+ <iframe id="iFrame" class="!w-100% !h-[calc(100vh-90px)]" src="" frameborder="0"></iframe>
|
|
<el-divider class="!ma-0" />
|
|
<el-divider class="!ma-0" />
|
|
<div class="position-sticky left-20px !h-50px lh-50px">
|
|
<div class="position-sticky left-20px !h-50px lh-50px">
|
|
- <el-button type="primary" class="!w-100px" @click="drawer = false; drawerUrl = ''">关 闭</el-button>
|
|
|
|
|
|
+ <el-button type="primary" class="!w-100px" @click="drawer = false">关 闭</el-button>
|
|
</div>
|
|
</div>
|
|
</el-drawer>
|
|
</el-drawer>
|
|
</template>
|
|
</template>
|
|
@@ -78,91 +59,87 @@
|
|
<script setup>
|
|
<script setup>
|
|
/** 人才采集 网页解析 */
|
|
/** 人才采集 网页解析 */
|
|
defineOptions({ name: 'WebPageParsing' })
|
|
defineOptions({ name: 'WebPageParsing' })
|
|
-import FirecrawlApp from '@mendable/firecrawl-js'
|
|
|
|
-import { SetUp } from '@element-plus/icons-vue'
|
|
|
|
|
|
+import axios from 'axios'
|
|
|
|
|
|
const message = useMessage() // 消息弹窗
|
|
const message = useMessage() // 消息弹窗
|
|
const { t } = useI18n() // 国际化
|
|
const { t } = useI18n() // 国际化
|
|
|
|
|
|
const loading = ref(false)
|
|
const loading = ref(false)
|
|
const queryParams = reactive({
|
|
const queryParams = reactive({
|
|
- // urls: 'https://element.eleme.cn/#/zh-CN/component/installation'
|
|
|
|
- urls: 'https://mp.weixin.qq.com/s/WeCRR3zN3fPvlGR4t8YFDA'
|
|
|
|
|
|
+ urls: 'https://mp.weixin.qq.com/s/vQLWlSB6DzqSewtBLkk_kQ'
|
|
})
|
|
})
|
|
const queryFormRef = ref()
|
|
const queryFormRef = ref()
|
|
const contents = ref([])
|
|
const contents = ref([])
|
|
const drawer = ref(false)
|
|
const drawer = ref(false)
|
|
-const drawerUrl = ref('')
|
|
|
|
|
|
|
|
-const showPage = (content) => {
|
|
|
|
- drawer.value = true
|
|
|
|
- drawerUrl.value = content.url
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-const handleReload = async (content) => {
|
|
|
|
- content.data = null
|
|
|
|
- const res = await handleData(queryParams.urls)
|
|
|
|
- content.tab = 0
|
|
|
|
- content.data = res
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-const handleData = async (url) => {
|
|
|
|
- try {
|
|
|
|
- const app = new FirecrawlApp({ apiKey: 'fc-85c1550c6db64ce4ae8f2d2cd2606e6f' })
|
|
|
|
- const crawlResponse = await app.crawlUrl(url, {
|
|
|
|
- limit: 100,
|
|
|
|
- scrapeOptions: {
|
|
|
|
- formats: ['markdown', 'html']
|
|
|
|
- }
|
|
|
|
|
|
+// 查看原网页
|
|
|
|
+const showPage = (res) => {
|
|
|
|
+ if (res.data) {
|
|
|
|
+ drawer.value = true
|
|
|
|
+ let html = res.data
|
|
|
|
+ html = html.replace(/data-src/g, 'src') // 将 data-src 转化为 src
|
|
|
|
+ .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/g, '') // 移除HTML内容中所有的<script>标签,这样可以避免在iframe中执行潜在的不受信任的脚本。
|
|
|
|
+ .replace(/https/g, 'http') // 将HTML内容中所有的https替换为http,可能是为了避免在HTTPS环境下加载非HTTPS资源导致浏览器警告
|
|
|
|
+
|
|
|
|
+ nextTick(() => {
|
|
|
|
+ const iframe = document.getElementById('iFrame')
|
|
|
|
+ if (!iframe) return
|
|
|
|
+ const doc = iframe.contentDocument || iframe.document
|
|
|
|
+ // 设置 iframe 中请求不发送 referrer,以绕过图片防盗链
|
|
|
|
+ const htmlArr = html.split('</head>')
|
|
|
|
+ const html_src_add = htmlArr[0] + '<meta name="referrer" content="never"></head>' + htmlArr[1]
|
|
|
|
+ doc.open()
|
|
|
|
+ doc.write(html_src_add)
|
|
|
|
+ doc.close()
|
|
|
|
+ // 通过延时获取文档高度赋值Iframe去除滚动条,根据实际情况增加延时时间
|
|
|
|
+ setTimeout(() => {
|
|
|
|
+ const jsContent = doc.getElementById('js_content')
|
|
|
|
+ if (jsContent) {
|
|
|
|
+ jsContent.style.visibility = 'visible'
|
|
|
|
+ jsContent.style.opacity = 1
|
|
|
|
+ }
|
|
|
|
+ }, 100)
|
|
})
|
|
})
|
|
- if (!crawlResponse.success) {
|
|
|
|
- throw new Error(`Failed to crawl: ${crawlResponse.error}`)
|
|
|
|
- }
|
|
|
|
- return crawlResponse
|
|
|
|
- } catch (error) {
|
|
|
|
- return error.message
|
|
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
-// const handleExecute = async () => {
|
|
|
|
-// if (!queryParams.urls) return
|
|
|
|
-// contents.value = []
|
|
|
|
-// const urls = queryParams.urls.split(',')
|
|
|
|
-
|
|
|
|
-// const run = async (url) => {
|
|
|
|
-// contents.value.push({ url, tab: 'markdown', showHtml: false, data: null })
|
|
|
|
-// const res = await handleData(url)
|
|
|
|
-// contents.value[contents.value.length - 1] = { url, tab: 'markdown', showHtml: false, data: res }
|
|
|
|
-
|
|
|
|
-// if (contents.value.length < urls.length) {
|
|
|
|
-// await run(urls[contents.value.length])
|
|
|
|
-// }
|
|
|
|
-// }
|
|
|
|
-
|
|
|
|
-// await run(urls[contents.value.length])
|
|
|
|
-// }
|
|
|
|
-
|
|
|
|
// 执行
|
|
// 执行
|
|
const handleExecute = async () => {
|
|
const handleExecute = async () => {
|
|
if (!queryParams.urls) return
|
|
if (!queryParams.urls) return
|
|
contents.value = []
|
|
contents.value = []
|
|
- const urls = queryParams.urls.split(',').map(url => url.trim()).filter(url => url)
|
|
|
|
- if (urls.length === 0) return
|
|
|
|
-
|
|
|
|
- urls.forEach(url => {
|
|
|
|
- contents.value.push({ url, tab: 'markdown', showHtml: false, data: null })
|
|
|
|
- })
|
|
|
|
-
|
|
|
|
- const crawlPromises = urls.map(async (url, index) => {
|
|
|
|
- const res = await handleData(url)
|
|
|
|
- contents.value[index] = { ...contents.value[index], data: res }
|
|
|
|
- })
|
|
|
|
-
|
|
|
|
|
|
+ loading.value = true
|
|
try {
|
|
try {
|
|
- await Promise.all(crawlPromises)
|
|
|
|
- console.log('All crawls completed:', contents.value); // 可在此处添加成功回调
|
|
|
|
- } catch (error) {
|
|
|
|
- console.error('爬取过程中发生错误:', error);
|
|
|
|
|
|
+ const urlArr = queryParams.urls.split(',').map(url => url.trim()).filter(Boolean)
|
|
|
|
+ const isWeChatUrl = urlArr.every(url => url.includes('https://mp.weixin.qq.com'))
|
|
|
|
+ if (!isWeChatUrl) {
|
|
|
|
+ message.warning('请输入微信公众文章链接')
|
|
|
|
+ return
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ const requests = urlArr.map(url => {
|
|
|
|
+ const path = url.split('mp.weixin.qq.com')[1]
|
|
|
|
+ const realurl = `/weixin-article${path}`
|
|
|
|
+ return axios.get(realurl).then(res => ({
|
|
|
|
+ url: url,
|
|
|
|
+ data: res.data
|
|
|
|
+ }))
|
|
|
|
+ })
|
|
|
|
+
|
|
|
|
+ const results = await Promise.all(requests)
|
|
|
|
+ contents.value = results.filter(item => item.data)
|
|
|
|
+ } catch (e) {
|
|
|
|
+ console.error(e)
|
|
|
|
+ message.error('获取失败')
|
|
|
|
+ } finally {
|
|
|
|
+ loading.value = false
|
|
}
|
|
}
|
|
}
|
|
}
|
|
-</script>
|
|
|
|
|
|
+</script>
|
|
|
|
+
|
|
|
|
+<style scoped>
|
|
|
|
+.iframe-container {
|
|
|
|
+ position: relative;
|
|
|
|
+ width: 100%;
|
|
|
|
+ height: calc(100vh - 90px);
|
|
|
|
+}
|
|
|
|
+</style>
|