Bladeren bron

网页解析:添加转换为markdown格式

Xiao_123 2 weken geleden
bovenliggende
commit
270e4417aa
3 gewijzigde bestanden met toevoegingen van 83 en 0 verwijderingen
  1. 1 0
      package.json
  2. 15 0
      pnpm-lock.yaml
  3. 67 0
      src/views/menduner/system/talentMap/maintenance/gather/webPageParsing/index.vue

+ 1 - 0
package.json

@@ -63,6 +63,7 @@
     "qrcode": "^1.5.3",
     "qs": "^6.12.0",
     "steady-xml": "^0.1.0",
+    "turndown": "^7.2.0",
     "url": "^0.11.3",
     "video.js": "^7.21.5",
     "vue": "3.4.21",

+ 15 - 0
pnpm-lock.yaml

@@ -119,6 +119,9 @@ importers:
       steady-xml:
         specifier: ^0.1.0
         version: 0.1.0
+      turndown:
+        specifier: ^7.2.0
+        version: 7.2.0
       url:
         specifier: ^0.11.3
         version: 0.11.3
@@ -1373,6 +1376,9 @@ packages:
   '@mendable/firecrawl-js@1.19.1':
     resolution: {integrity: sha512-rtBnlF6oLJAxhH4YG8P72FernR1TvdI4J7uiSad2hOF7ZtbkzHkuNsa/10KUTPsmeQf8ESxiSQ1p7HMyEXuW9g==}
 
+  '@mixmark-io/domino@2.2.0':
+    resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==}
+
   '@nodelib/fs.scandir@2.1.5':
     resolution: {integrity: sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==}
     engines: {node: '>= 8'}
@@ -4864,6 +4870,9 @@ packages:
   tslib@2.6.2:
     resolution: {integrity: sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==}
 
+  turndown@7.2.0:
+    resolution: {integrity: sha512-eCZGBN4nNNqM9Owkv9HAtWRYfLA4h909E/WGAWWBpmB275ehNhZyk87/Tpvjbp0jjNl9XwCsbe6bm6CqFsgD+A==}
+
   type-check@0.4.0:
     resolution: {integrity: sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==}
     engines: {node: '>= 0.8.0'}
@@ -6538,6 +6547,8 @@ snapshots:
       - debug
       - ws
 
+  '@mixmark-io/domino@2.2.0': {}
+
   '@nodelib/fs.scandir@2.1.5':
     dependencies:
       '@nodelib/fs.stat': 2.0.5
@@ -10413,6 +10424,10 @@ snapshots:
 
   tslib@2.6.2: {}
 
+  turndown@7.2.0:
+    dependencies:
+      '@mixmark-io/domino': 2.2.0
+
   type-check@0.4.0:
     dependencies:
       prelude-ls: 1.2.1

+ 67 - 0
src/views/menduner/system/talentMap/maintenance/gather/webPageParsing/index.vue

@@ -37,6 +37,7 @@
           <div class="overflow-y-auto !h-360px" v-if="content.data">
             <pre>{{ content.data }}</pre>
           </div>
+					<el-button type="primary" plain class="mt-10px" @click="handleConvert(content)">转换为markdown格式</el-button>
 				</el-card>
 			</el-col>
 		</el-row>
@@ -54,12 +55,26 @@
 			<el-button type="primary" class="!w-100px" @click="drawer = false">关 闭</el-button>
 		</div>
 	</el-drawer>
+
+	<!-- Drawer bound to showMarkDown that displays markdownContent as preformatted text. -->
+	<el-drawer
+		v-model="showMarkDown"
+		class="!w-50vw"
+		:with-header="false"
+		:modal="true"
+	>
+		<pre>{{ markdownContent }}</pre>
+		<el-divider class="!ma-0" />
+		<div class="position-sticky left-20px !h-50px lh-50px">
+			<!-- The close button hides the drawer and clears the converted text. -->
+			<el-button type="primary" class="!w-100px" @click="showMarkDown = false; markdownContent = ''">关 闭</el-button>
+		</div>
+	</el-drawer>
 </template>
 
 <script setup>
 /** 人才采集 网页解析 */
 defineOptions({ name: 'WebPageParsing' })
 import axios from 'axios'
+import TurndownService from 'turndown'
 
 const message = useMessage() // 消息弹窗
 const { t } = useI18n() // 国际化
@@ -72,6 +87,58 @@ const queryFormRef = ref()
 const contents = ref([])
 const drawer = ref(false)
 
+// 创建转换服务
+const turndownService = new TurndownService({
+  headingStyle: 'atx',
+  codeBlockStyle: 'fenced',
+  bulletListMarker: '-'
+})
+
+// 添加自定义规则处理微信公众号特有内容
+turndownService.addRule('wechatImages', {
+  filter: 'img',
+  replacement: (content, node) => {
+    const alt = node.getAttribute('alt') || ''
+    const src = node.getAttribute('src') || node.getAttribute('data-src') || ''
+    return `![${alt}](${src})`
+  }
+})
+
+// 提取主要内容并转换为Markdown
+const wechatHtmlToMarkdown = (html) => {
+  // 创建一个临时DOM解析器
+  const parser = new DOMParser()
+  const doc = parser.parseFromString(html, 'text/html')
+  
+  // 提取正文内容 - 微信公众号文章通常在id="js_content"的div中
+  const content = doc.querySelector('#js_content') || doc.body
+  
+  // 移除不需要的元素
+  const elementsToRemove = [
+    'script', 'style', 'iframe', 'button', 
+    '.qr_code', '.rich_media_extra', '.copyright'
+  ]
+  
+  elementsToRemove.forEach(selector => {
+    content.querySelectorAll(selector).forEach(el => el.remove())
+  })
+  
+  // 转换为Markdown
+  return turndownService.turndown(content.innerHTML)
+}
+
+// 转换为markdown格式
+const showMarkDown = ref(false)
+const markdownContent = ref(null)
+const handleConvert = (res) => {
+	if (!res.data) return
+	const result = wechatHtmlToMarkdown(res.data)
+	// console.log(result, 'markdown格式')
+	if (!result) return message.warning('转换失败')
+	markdownContent.value = result
+	showMarkDown.value = true
+}
+
 // 查看原网页
 const showPage = (res) => {
   if (res.data) {