1 ngày trước cách đây · eee0f40283
--- a/app/api/data_parse/routes.py
+++ b/app/api/data_parse/routes.py
@@ -3,6 +3,8 @@ from app.api.data_parse import bp
 
				 from app.core.data_parse.parse import update_business_card, get_business_cards, update_business_card_status, create_talent_tag, get_talent_tag_list, update_talent_tag, delete_talent_tag, query_neo4j_graph, talent_get_tags, talent_update_tags, get_business_card, search_business_cards_by_mobile, get_hotel_positions_list, add_hotel_positions, update_hotel_positions, query_hotel_positions, delete_hotel_positions, get_hotel_group_brands_list, add_hotel_group_brands, update_hotel_group_brands, query_hotel_group_brands, delete_hotel_group_brands, get_duplicate_records, process_duplicate_record, get_duplicate_record_detail, fix_broken_duplicate_records
			
 
				 # 导入新的名片图片解析函数和添加名片函数
			
 
				 from app.core.data_parse.parse_card import process_business_card_image, add_business_card, delete_business_card
			
 
				+# 导入网页文本解析函数
			
 
				+from app.core.data_parse.parse_web import process_webpage_with_QWen
			
 
				 from app.config.config import DevelopmentConfig, ProductionConfig
			
 
				 import logging
			
 
				 import boto3
			
@@ -1472,3 +1474,113 @@ def fix_broken_duplicate_records_route():
 
				             'data': None
			
 
				         }), 500
			
 
				 
			
 
				+
			
 
				+# 网页文本解析接口
			
 
				+@bp.route('/webpage-parse', methods=['POST'])
			
 
				+def webpage_parse_route():
			
 
				+    """
			
 
				+    解析网页 Markdown 文本并提取人员信息的API接口
			
 
				+    
			
 
				+    请求参数:
			
 
				+        - markdown_text: 网页的 Markdown 格式文本内容 (JSON格式)
			
 
				+        
			
 
				+    请求体示例:
			
 
				+        {
			
 
				+            "markdown_text": "# 张三\n\n职位：高级经理\n\n公司：XX酒店\n\n![照片](http://example.com/photo.jpg)"
			
 
				+        }
			
 
				+        
			
 
				+    返回:
			
 
				+        - JSON: 包含提取的人员信息和处理状态
			
 
				+        
			
 
				+    功能说明:
			
 
				+        - 接收 Markdown 格式的网页文本
			
 
				+        - 进行必要的格式和内容验证
			
 
				+        - 使用 Qwen VL Max 模型提取人员信息
			
 
				+        - 支持提取照片链接 (pic_url)
			
 
				+        - 返回标准化的人员信息数据
			
 
				+        
			
 
				+    状态码:
			
 
				+        - 200: 解析成功
			
 
				+        - 400: 请求参数错误
			
 
				+        - 500: 解析失败
			
 
				+    """
			
 
				+    try:
			
 
				+        # 检查请求是否为 JSON 格式
			
 
				+        if not request.is_json:
			
 
				+            return jsonify({
			
 
				+                'success': False,
			
 
				+                'message': '请求必须是 JSON 格式',
			
 
				+                'data': None
			
 
				+            }), 400
			
 
				+        
			
 
				+        # 获取请求数据
			
 
				+        data = request.get_json()
			
 
				+        
			
 
				+        # 检查是否提供了 markdown_text 参数
			
 
				+        if 'markdown_text' not in data:
			
 
				+            return jsonify({
			
 
				+                'success': False,
			
 
				+                'message': '缺少必填参数: markdown_text',
			
 
				+                'data': None
			
 
				+            }), 400
			
 
				+        
			
 
				+        markdown_text = data['markdown_text']
			
 
				+        
			
 
				+        # 验证 markdown_text 是否为字符串
			
 
				+        if not isinstance(markdown_text, str):
			
 
				+            return jsonify({
			
 
				+                'success': False,
			
 
				+                'message': 'markdown_text 必须是字符串类型',
			
 
				+                'data': None
			
 
				+            }), 400
			
 
				+        
			
 
				+        # 验证文本内容不能为空
			
 
				+        if not markdown_text.strip():
			
 
				+            return jsonify({
			
 
				+                'success': False,
			
 
				+                'message': 'markdown_text 内容不能为空',
			
 
				+                'data': None
			
 
				+            }), 400
			
 
				+        
			
 
				+        # 验证文本长度（防止过长的文本）
			
 
				+        if len(markdown_text) > 50000:  # 限制最大50KB
			
 
				+            return jsonify({
			
 
				+                'success': False,
			
 
				+                'message': 'markdown_text 内容过长，最大支持50KB',
			
 
				+                'data': None
			
 
				+            }), 400
			
 
				+        
			
 
				+        # 基本的 Markdown 格式验证（可选）
			
 
				+        # 检查是否包含一些基本的文本内容
			
 
				+        if len(markdown_text.strip()) < 10:
			
 
				+            return jsonify({
			
 
				+                'success': False,
			
 
				+                'message': 'markdown_text 内容过短，无法进行有效解析',
			
 
				+                'data': None
			
 
				+            }), 400
			
 
				+        
			
 
				+        # 记录解析请求
			
 
				+        logger.info(f"开始解析网页文本，内容长度: {len(markdown_text)} 字符")
			
 
				+        
			
 
				+        # 调用网页文本解析函数
			
 
				+        extracted_data = process_webpage_with_QWen(markdown_text)
			
 
				+        
			
 
				+        # 返回成功结果
			
 
				+        return jsonify({
			
 
				+            'success': True,
			
 
				+            'message': '网页文本解析成功',
			
 
				+            'data': extracted_data
			
 
				+        }), 200
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        # 记录错误日志
			
 
				+        error_msg = f"网页文本解析失败: {str(e)}"
			
 
				+        logger.error(error_msg, exc_info=True)
			
 
				+        
			
 
				+        # 返回错误响应
			
 
				+        return jsonify({
			
 
				+            'success': False,
			
 
				+            'message': error_msg,
			
 
				+            'data': None
			
 
				+        }), 500
			
 
				+
			
--- a/app/core/data_parse/parse_web.py
+++ b/app/core/data_parse/parse_web.py
@@ -0,0 +1,192 @@
 
				+import os
			
 
				+import json
			
 
				+import logging
			
 
				+import re
			
 
				+from datetime import datetime
			
 
				+from openai import OpenAI
			
 
				+
			
 
				+
			
 
				+def extract_json_from_text(text):
			
 
				+    """
			
 
				+    从文本中提取JSON部分
			
 
				+    
			
 
				+    Args:
			
 
				+        text (str): 包含JSON的文本
			
 
				+        
			
 
				+    Returns:
			
 
				+        str: 提取的JSON字符串
			
 
				+    """
			
 
				+    # 尝试找到最外层的花括号对
			
 
				+    start_idx = text.find('{')
			
 
				+    if start_idx == -1:
			
 
				+        return "{}"
			
 
				+    
			
 
				+    # 使用简单的括号匹配算法找到对应的闭合括号
			
 
				+    count = 0
			
 
				+    for i in range(start_idx, len(text)):
			
 
				+        if text[i] == '{':
			
 
				+            count += 1
			
 
				+        elif text[i] == '}':
			
 
				+            count -= 1
			
 
				+            if count == 0:
			
 
				+                return text[start_idx:i+1]
			
 
				+    
			
 
				+    # 如果没有找到闭合括号，返回从开始位置到文本结尾
			
 
				+    return text[start_idx:]
			
 
				+
			
 
				+
			
 
				+def process_webpage_with_QWen(markdown_text):
			
 
				+    """
			
 
				+    使用阿里云的 Qwen VL Max 模型解析网页 markdown 文本中的名片信息
			
 
				+    
			
 
				+    Args:
			
 
				+        markdown_text (str): 网页的 markdown 格式文本内容
			
 
				+        
			
 
				+    Returns:
			
 
				+        dict: 解析的名片信息
			
 
				+    """
			
 
				+    # 阿里云 Qwen API 配置
			
 
				+    QWEN_API_KEY = os.environ.get('QWEN_API_KEY', 'sk-8f2320dafc9e4076968accdd8eebd8e9')
			
 
				+    
			
 
				+    try:
			
 
				+        # 初始化 OpenAI 客户端，配置为阿里云 API
			
 
				+        client = OpenAI(
			
 
				+            api_key=QWEN_API_KEY,
			
 
				+            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
			
 
				+        )
			
 
				+        
			
 
				+        # 构建针对网页文本的优化提示语
			
 
				+        prompt = """你是数据处理专家。请仔细分析提供的网页Markdown文本内容，精确提取其中个人信息。
			
 
				+
			
 
				+## 提取要求
			
 
				+- 区分中英文内容，分别提取
			
 
				+- 保持提取信息的原始格式（如大小写、标点）
			
 
				+- 对于无法识别或文本中不存在的信息，返回空字符串
			
 
				+- 文本中没有的信息，请不要猜测
			
 
				+- 从网页内容中识别人员简介、联系方式、职位信息等
			
 
				+
			
 
				+## 需提取的字段
			
 
				+1. 中文姓名 (name_zh)
			
 
				+2. 英文姓名 (name_en)
			
 
				+3. 中文职位/头衔 (title_zh)
			
 
				+4. 英文职位/头衔 (title_en)
			
 
				+5. 中文酒店/公司名称 (hotel_zh)
			
 
				+6. 英文酒店/公司名称 (hotel_en)
			
 
				+7. 手机号码 (mobile) - 如有多个手机号码，使用逗号分隔，最多提取3个
			
 
				+8. 固定电话 (phone) - 如有多个，使用逗号分隔
			
 
				+9. 电子邮箱 (email)
			
 
				+10. 中文地址 (address_zh)
			
 
				+11. 英文地址 (address_en)
			
 
				+12. 中文邮政编码 (postal_code_zh)
			
 
				+13. 英文邮政编码 (postal_code_en)
			
 
				+14. 生日 (birthday) - 格式为YYYY-MM-DD，如1990-01-01
			
 
				+15. 年龄 (age) - 数字格式，如30
			
 
				+16. 籍贯 (native_place) - 出生地或户籍所在地信息
			
 
				+17. 居住地 (residence) - 个人居住地址信息
			
 
				+18. 品牌组合 (brand_group) - 如有多个品牌，使用逗号分隔
			
 
				+19. 职业轨迹 (career_path) - 如能从文本中推断，以JSON数组格式返回，包含当前日期，公司名称和职位。自动生成当前日期。
			
 
				+20. 隶属关系 (affiliation) - 如能从文本中推断，以JSON数组格式返回，包含公司名称和隶属集团名称
			
 
				+21. 照片链接 (pic_url) - 人物的照片URL链接，从网页中的图片标签或链接中提取
			
 
				+
			
 
				+## 输出格式
			
 
				+请以严格的JSON格式返回结果，不要添加任何额外解释文字。JSON格式如下：
			
 
				+```json
			
 
				+{
			
 
				+  "name_zh": "",
			
 
				+  "name_en": "",
			
 
				+  "title_zh": "",
			
 
				+  "title_en": "",
			
 
				+  "hotel_zh": "",
			
 
				+  "hotel_en": "",
			
 
				+  "mobile": "",
			
 
				+  "phone": "",
			
 
				+  "email": "",
			
 
				+  "address_zh": "",
			
 
				+  "address_en": "",
			
 
				+  "postal_code_zh": "",
			
 
				+  "postal_code_en": "",
			
 
				+  "birthday": "",
			
 
				+  "age": 0,
			
 
				+  "native_place": "",
			
 
				+  "residence": "",
			
 
				+  "brand_group": "",
			
 
				+  "career_path": [],
			
 
				+  "affiliation": [],
			
 
				+  "pic_url": ""
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+以下是需要分析的网页Markdown文本内容：
			
 
				+
			
 
				+""" + markdown_text
			
 
				+        
			
 
				+        # 调用 Qwen VL Max API
			
 
				+        logging.info("发送网页文本请求到 Qwen VL Max 模型")
			
 
				+        completion = client.chat.completions.create(
			
 
				+            model="qwen-vl-max-latest",
			
 
				+            messages=[
			
 
				+                {
			
 
				+                    "role": "user",
			
 
				+                    "content": [
			
 
				+                        {"type": "text", "text": prompt}
			
 
				+                    ]
			
 
				+                }
			
 
				+            ],
			
 
				+            temperature=0.1,  # 降低温度增加精确性
			
 
				+            response_format={"type": "json_object"}  # 要求输出JSON格式
			
 
				+        )
			
 
				+        
			
 
				+        # 解析响应
			
 
				+        response_content = completion.choices[0].message.content
			
 
				+        logging.info(f"成功从 Qwen 模型获取网页文本响应: {response_content}")
			
 
				+        
			
 
				+        # 尝试从响应中提取 JSON
			
 
				+        try:
			
 
				+            json_content = extract_json_from_text(response_content)
			
 
				+            extracted_data = json.loads(json_content)
			
 
				+            logging.info("成功解析 Qwen 网页文本响应中的 JSON")
			
 
				+        except json.JSONDecodeError as e:
			
 
				+            error_msg = f"JSON 解析失败: {str(e)}"
			
 
				+            logging.error(error_msg)
			
 
				+            raise Exception(error_msg)
			
 
				+
			
 
				+        # 确保所有必要字段存在
			
 
				+        required_fields = [
			
 
				+            'name_zh', 'name_en', 'title_zh', 'title_en', 
			
 
				+            'hotel_zh', 'hotel_en', 'mobile', 'phone', 
			
 
				+            'email', 'address_zh', 'address_en',
			
 
				+            'postal_code_zh', 'postal_code_en', 'birthday', 'age', 'native_place', 'residence',
			
 
				+            'brand_group', 'career_path', 'pic_url'
			
 
				+        ]
			
 
				+        
			
 
				+        for field in required_fields:
			
 
				+            if field not in extracted_data:
			
 
				+                if field == 'career_path':
			
 
				+                    extracted_data[field] = []
			
 
				+                elif field == 'age':
			
 
				+                    extracted_data[field] = 0
			
 
				+                else:
			
 
				+                    extracted_data[field] = ""
			
 
				+        
			
 
				+        # 为career_path增加一条记录（如果有相关信息）
			
 
				+        if extracted_data.get('hotel_zh') or extracted_data.get('hotel_en') or extracted_data.get('title_zh') or extracted_data.get('title_en'):
			
 
				+            career_entry = {
			
 
				+                'date': datetime.now().strftime('%Y-%m-%d'),
			
 
				+                'hotel_en': extracted_data.get('hotel_en', ''),
			
 
				+                'hotel_zh': extracted_data.get('hotel_zh', ''),
			
 
				+                'image_path': '',
			
 
				+                'source': 'webpage_extraction',
			
 
				+                'title_en': extracted_data.get('title_en', ''),
			
 
				+                'title_zh': extracted_data.get('title_zh', '')
			
 
				+            }
			
 
				+            
			
 
				+            # 直接清空原有的career_path内容，用career_entry写入
			
 
				+            extracted_data['career_path'] = [career_entry]
			
 
				+            logging.info(f"为网页解析结果设置了career_path记录: {career_entry}")
			
 
				+        
			
 
				+        return extracted_data
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        error_msg = f"Qwen VL Max 模型网页文本解析失败: {str(e)}"
			
 
				+        logging.error(error_msg, exc_info=True)
			
 
				+        raise Exception(error_msg) 
			
--- a/webpage-parse-api-documentation.md
+++ b/webpage-parse-api-documentation.md
@@ -0,0 +1,637 @@
 
				+# 网页文本解析 API 接口说明文档
			
 
				+
			
 
				+## 接口概述
			
 
				+
			
 
				+**接口名称**: 网页文本解析接口  
			
 
				+**接口路径**: `POST /api/data_parse/webpage-parse`  
			
 
				+**接口功能**: 解析网页 Markdown 文本并提取人员信息  
			
 
				+**版本**: v1.0  
			
 
				+**最后更新**: 2024-12-19  
			
 
				+
			
 
				+## 功能描述
			
 
				+
			
 
				+该接口使用阿里云 Qwen VL Max 模型智能解析网页 Markdown 格式文本，自动提取其中的人员信息，包括姓名、职位、联系方式、公司信息、照片链接等多种字段。适用于从网页内容、人员简介、名片信息等文本中提取结构化数据。
			
 
				+
			
 
				+## 接口地址
			
 
				+
			
 
				+```
			
 
				+POST http://your-domain.com/api/data_parse/webpage-parse
			
 
				+```
			
 
				+
			
 
				+## 请求参数
			
 
				+
			
 
				+### 请求头 (Headers)
			
 
				+
			
 
				+| 参数名 | 类型 | 必填 | 说明 |
			
 
				+|--------|------|------|------|
			
 
				+| Content-Type | string | 是 | application/json |
			
 
				+
			
 
				+### 请求体 (Request Body)
			
 
				+
			
 
				+| 参数名 | 类型 | 必填 | 长度限制 | 说明 |
			
 
				+|--------|------|------|----------|------|
			
 
				+| markdown_text | string | 是 | 10-50000字符 | 网页的 Markdown 格式文本内容 |
			
 
				+
			
 
				+#### 请求体示例
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+    "markdown_text": "# 张三 - 高级酒店经理\n\n## 个人信息\n\n**职位**: 高级经理  \n**公司**: 香格里拉大酒店  \n**手机**: 13800138000  \n**邮箱**: zhangsan@example.com  \n**地址**: 北京市朝阳区建国路1号  \n\n![个人照片](https://example.com/photos/zhangsan.jpg)\n\n## 工作经历\n\n- 2020-至今: 香格里拉大酒店 高级经理\n- 2018-2020: 凯悦酒店 部门经理"
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+## 响应参数
			
 
				+
			
 
				+### 成功响应 (200 OK)
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+    "success": true,
			
 
				+    "message": "网页文本解析成功",
			
 
				+    "data": {
			
 
				+        "name_zh": "张三",
			
 
				+        "name_en": "Zhang San",
			
 
				+        "title_zh": "高级经理",
			
 
				+        "title_en": "Senior Manager",
			
 
				+        "hotel_zh": "香格里拉大酒店",
			
 
				+        "hotel_en": "Shangri-La Hotel",
			
 
				+        "mobile": "13800138000",
			
 
				+        "phone": "",
			
 
				+        "email": "zhangsan@example.com",
			
 
				+        "address_zh": "北京市朝阳区建国路1号",
			
 
				+        "address_en": "",
			
 
				+        "postal_code_zh": "",
			
 
				+        "postal_code_en": "",
			
 
				+        "birthday": "",
			
 
				+        "age": 0,
			
 
				+        "native_place": "",
			
 
				+        "residence": "",
			
 
				+        "brand_group": "",
			
 
				+        "career_path": [
			
 
				+            {
			
 
				+                "date": "2024-12-19",
			
 
				+                "hotel_zh": "香格里拉大酒店",
			
 
				+                "hotel_en": "Shangri-La Hotel",
			
 
				+                "title_zh": "高级经理",
			
 
				+                "title_en": "Senior Manager",
			
 
				+                "image_path": "",
			
 
				+                "source": "webpage_extraction"
			
 
				+            }
			
 
				+        ],
			
 
				+        "affiliation": [],
			
 
				+        "pic_url": "https://example.com/photos/zhangsan.jpg"
			
 
				+    }
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+### 响应字段说明
			
 
				+
			
 
				+| 字段名 | 类型 | 说明 |
			
 
				+|--------|------|------|
			
 
				+| success | boolean | 请求是否成功 |
			
 
				+| message | string | 响应消息 |
			
 
				+| data | object | 提取的人员信息数据 |
			
 
				+
			
 
				+#### data 对象字段说明
			
 
				+
			
 
				+| 字段名 | 类型 | 说明 |
			
 
				+|--------|------|------|
			
 
				+| name_zh | string | 中文姓名 |
			
 
				+| name_en | string | 英文姓名 |
			
 
				+| title_zh | string | 中文职位/头衔 |
			
 
				+| title_en | string | 英文职位/头衔 |
			
 
				+| hotel_zh | string | 中文酒店/公司名称 |
			
 
				+| hotel_en | string | 英文酒店/公司名称 |
			
 
				+| mobile | string | 手机号码（多个用逗号分隔，最多3个） |
			
 
				+| phone | string | 固定电话（多个用逗号分隔） |
			
 
				+| email | string | 电子邮箱 |
			
 
				+| address_zh | string | 中文地址 |
			
 
				+| address_en | string | 英文地址 |
			
 
				+| postal_code_zh | string | 中文邮政编码 |
			
 
				+| postal_code_en | string | 英文邮政编码 |
			
 
				+| birthday | string | 生日（格式：YYYY-MM-DD） |
			
 
				+| age | integer | 年龄 |
			
 
				+| native_place | string | 籍贯 |
			
 
				+| residence | string | 居住地 |
			
 
				+| brand_group | string | 品牌组合 |
			
 
				+| career_path | array | 职业轨迹（JSON数组） |
			
 
				+| affiliation | array | 隶属关系（JSON数组） |
			
 
				+| pic_url | string | 照片链接URL |
			
 
				+
			
 
				+## 错误响应
			
 
				+
			
 
				+### 400 - 请求参数错误
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+    "success": false,
			
 
				+    "message": "错误描述",
			
 
				+    "data": null
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+**常见错误消息**:
			
 
				+- `请求必须是 JSON 格式`
			
 
				+- `缺少必填参数: markdown_text`
			
 
				+- `markdown_text 必须是字符串类型`
			
 
				+- `markdown_text 内容不能为空`
			
 
				+- `markdown_text 内容过长，最大支持50KB`
			
 
				+- `markdown_text 内容过短，无法进行有效解析`
			
 
				+
			
 
				+### 500 - 服务器错误
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+    "success": false,
			
 
				+    "message": "网页文本解析失败: 具体错误信息",
			
 
				+    "data": null
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+## 前端调用示例
			
 
				+
			
 
				+### JavaScript (原生)
			
 
				+
			
 
				+```javascript
			
 
				+async function parseWebpageText(markdownText) {
			
 
				+    try {
			
 
				+        const response = await fetch('/api/data_parse/webpage-parse', {
			
 
				+            method: 'POST',
			
 
				+            headers: {
			
 
				+                'Content-Type': 'application/json',
			
 
				+            },
			
 
				+            body: JSON.stringify({
			
 
				+                markdown_text: markdownText
			
 
				+            })
			
 
				+        });
			
 
				+
			
 
				+        const result = await response.json();
			
 
				+        
			
 
				+        if (result.success) {
			
 
				+            console.log('解析成功:', result.data);
			
 
				+            return result.data;
			
 
				+        } else {
			
 
				+            console.error('解析失败:', result.message);
			
 
				+            throw new Error(result.message);
			
 
				+        }
			
 
				+    } catch (error) {
			
 
				+        console.error('请求失败:', error);
			
 
				+        throw error;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+// 使用示例
			
 
				+const markdownContent = `
			
 
				+# 李明 - 酒店总经理
			
 
				+
			
 
				+## 联系信息
			
 
				+- 手机: 13912345678
			
 
				+- 邮箱: liming@hotel.com
			
 
				+- 公司: 北京国际大酒店
			
 
				+
			
 
				+![照片](https://example.com/photos/liming.jpg)
			
 
				+`;
			
 
				+
			
 
				+parseWebpageText(markdownContent)
			
 
				+    .then(data => {
			
 
				+        console.log('提取的人员信息:', data);
			
 
				+    })
			
 
				+    .catch(error => {
			
 
				+        console.error('解析出错:', error);
			
 
				+    });
			
 
				+```
			
 
				+
			
 
				+### JavaScript (jQuery)
			
 
				+
			
 
				+```javascript
			
 
				+function parseWebpageTextWithJQuery(markdownText) {
			
 
				+    return $.ajax({
			
 
				+        url: '/api/data_parse/webpage-parse',
			
 
				+        type: 'POST',
			
 
				+        contentType: 'application/json',
			
 
				+        data: JSON.stringify({
			
 
				+            markdown_text: markdownText
			
 
				+        }),
			
 
				+        success: function(result) {
			
 
				+            if (result.success) {
			
 
				+                console.log('解析成功:', result.data);
			
 
				+                return result.data;
			
 
				+            } else {
			
 
				+                console.error('解析失败:', result.message);
			
 
				+            }
			
 
				+        },
			
 
				+        error: function(xhr, status, error) {
			
 
				+            console.error('请求失败:', error);
			
 
				+        }
			
 
				+    });
			
 
				+}
			
 
				+
			
 
				+// 使用示例
			
 
				+const markdownText = "# 王芳\n职位: 前台经理\n手机: 13800000000";
			
 
				+parseWebpageTextWithJQuery(markdownText);
			
 
				+```
			
 
				+
			
 
				+### Vue.js 示例
			
 
				+
			
 
				+```vue
			
 
				+<template>
			
 
				+  <div class="webpage-parser">
			
 
				+    <h2>网页文本解析</h2>
			
 
				+    
			
 
				+    <div class="input-section">
			
 
				+      <label for="markdown-input">输入 Markdown 文本:</label>
			
 
				+      <textarea
			
 
				+        id="markdown-input"
			
 
				+        v-model="markdownText"
			
 
				+        rows="10"
			
 
				+        cols="80"
			
 
				+        placeholder="请输入网页的 Markdown 格式文本..."
			
 
				+      ></textarea>
			
 
				+      <br>
			
 
				+      <button @click="parseText" :disabled="loading">
			
 
				+        {{ loading ? '解析中...' : '开始解析' }}
			
 
				+      </button>
			
 
				+    </div>
			
 
				+
			
 
				+    <div v-if="result" class="result-section">
			
 
				+      <h3>解析结果:</h3>
			
 
				+      <div v-if="result.success" class="success-result">
			
 
				+        <p><strong>姓名:</strong> {{ result.data.name_zh }} / {{ result.data.name_en }}</p>
			
 
				+        <p><strong>职位:</strong> {{ result.data.title_zh }} / {{ result.data.title_en }}</p>
			
 
				+        <p><strong>公司:</strong> {{ result.data.hotel_zh }} / {{ result.data.hotel_en }}</p>
			
 
				+        <p><strong>手机:</strong> {{ result.data.mobile }}</p>
			
 
				+        <p><strong>邮箱:</strong> {{ result.data.email }}</p>
			
 
				+        <p><strong>照片:</strong> 
			
 
				+          <img v-if="result.data.pic_url" :src="result.data.pic_url" alt="照片" style="max-width: 100px;">
			
 
				+          <span v-else>无</span>
			
 
				+        </p>
			
 
				+        <details>
			
 
				+          <summary>完整数据</summary>
			
 
				+          <pre>{{ JSON.stringify(result.data, null, 2) }}</pre>
			
 
				+        </details>
			
 
				+      </div>
			
 
				+      <div v-else class="error-result">
			
 
				+        <p style="color: red;">解析失败: {{ result.message }}</p>
			
 
				+      </div>
			
 
				+    </div>
			
 
				+  </div>
			
 
				+</template>
			
 
				+
			
 
				+<script>
			
 
				+import axios from 'axios';
			
 
				+
			
 
				+export default {
			
 
				+  name: 'WebpageParser',
			
 
				+  data() {
			
 
				+    return {
			
 
				+      markdownText: '',
			
 
				+      result: null,
			
 
				+      loading: false
			
 
				+    };
			
 
				+  },
			
 
				+  methods: {
			
 
				+    async parseText() {
			
 
				+      if (!this.markdownText.trim()) {
			
 
				+        alert('请输入 Markdown 文本');
			
 
				+        return;
			
 
				+      }
			
 
				+
			
 
				+      this.loading = true;
			
 
				+      this.result = null;
			
 
				+
			
 
				+      try {
			
 
				+        const response = await axios.post('/api/data_parse/webpage-parse', {
			
 
				+          markdown_text: this.markdownText
			
 
				+        });
			
 
				+
			
 
				+        this.result = response.data;
			
 
				+      } catch (error) {
			
 
				+        this.result = {
			
 
				+          success: false,
			
 
				+          message: error.response?.data?.message || error.message
			
 
				+        };
			
 
				+      } finally {
			
 
				+        this.loading = false;
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+};
			
 
				+</script>
			
 
				+
			
 
				+<style scoped>
			
 
				+.webpage-parser {
			
 
				+  max-width: 800px;
			
 
				+  margin: 0 auto;
			
 
				+  padding: 20px;
			
 
				+}
			
 
				+
			
 
				+.input-section {
			
 
				+  margin-bottom: 30px;
			
 
				+}
			
 
				+
			
 
				+.input-section label {
			
 
				+  display: block;
			
 
				+  margin-bottom: 10px;
			
 
				+  font-weight: bold;
			
 
				+}
			
 
				+
			
 
				+.input-section textarea {
			
 
				+  width: 100%;
			
 
				+  margin-bottom: 10px;
			
 
				+}
			
 
				+
			
 
				+.input-section button {
			
 
				+  padding: 10px 20px;
			
 
				+  background-color: #007bff;
			
 
				+  color: white;
			
 
				+  border: none;
			
 
				+  border-radius: 4px;
			
 
				+  cursor: pointer;
			
 
				+}
			
 
				+
			
 
				+.input-section button:disabled {
			
 
				+  background-color: #6c757d;
			
 
				+  cursor: not-allowed;
			
 
				+}
			
 
				+
			
 
				+.result-section {
			
 
				+  border: 1px solid #ddd;
			
 
				+  padding: 20px;
			
 
				+  border-radius: 4px;
			
 
				+}
			
 
				+
			
 
				+.success-result p {
			
 
				+  margin: 10px 0;
			
 
				+}
			
 
				+
			
 
				+.error-result {
			
 
				+  background-color: #f8d7da;
			
 
				+  padding: 10px;
			
 
				+  border-radius: 4px;
			
 
				+}
			
 
				+</style>
			
 
				+```
			
 
				+
			
 
				+### React 示例
			
 
				+
			
 
				+```jsx
			
 
				+import React, { useState } from 'react';
			
 
				+import axios from 'axios';
			
 
				+
			
 
				+function WebpageParser() {
			
 
				+  const [markdownText, setMarkdownText] = useState('');
			
 
				+  const [result, setResult] = useState(null);
			
 
				+  const [loading, setLoading] = useState(false);
			
 
				+
			
 
				+  const parseText = async () => {
			
 
				+    if (!markdownText.trim()) {
			
 
				+      alert('请输入 Markdown 文本');
			
 
				+      return;
			
 
				+    }
			
 
				+
			
 
				+    setLoading(true);
			
 
				+    setResult(null);
			
 
				+
			
 
				+    try {
			
 
				+      const response = await axios.post('/api/data_parse/webpage-parse', {
			
 
				+        markdown_text: markdownText
			
 
				+      });
			
 
				+
			
 
				+      setResult(response.data);
			
 
				+    } catch (error) {
			
 
				+      setResult({
			
 
				+        success: false,
			
 
				+        message: error.response?.data?.message || error.message
			
 
				+      });
			
 
				+    } finally {
			
 
				+      setLoading(false);
			
 
				+    }
			
 
				+  };
			
 
				+
			
 
				+  return (
			
 
				+    <div style={{ maxWidth: '800px', margin: '0 auto', padding: '20px' }}>
			
 
				+      <h2>网页文本解析</h2>
			
 
				+      
			
 
				+      <div style={{ marginBottom: '30px' }}>
			
 
				+        <label style={{ display: 'block', marginBottom: '10px', fontWeight: 'bold' }}>
			
 
				+          输入 Markdown 文本:
			
 
				+        </label>
			
 
				+        <textarea
			
 
				+          value={markdownText}
			
 
				+          onChange={(e) => setMarkdownText(e.target.value)}
			
 
				+          rows={10}
			
 
				+          cols={80}
			
 
				+          placeholder="请输入网页的 Markdown 格式文本..."
			
 
				+          style={{ width: '100%', marginBottom: '10px' }}
			
 
				+        />
			
 
				+        <br />
			
 
				+        <button
			
 
				+          onClick={parseText}
			
 
				+          disabled={loading}
			
 
				+          style={{
			
 
				+            padding: '10px 20px',
			
 
				+            backgroundColor: loading ? '#6c757d' : '#007bff',
			
 
				+            color: 'white',
			
 
				+            border: 'none',
			
 
				+            borderRadius: '4px',
			
 
				+            cursor: loading ? 'not-allowed' : 'pointer'
			
 
				+          }}
			
 
				+        >
			
 
				+          {loading ? '解析中...' : '开始解析'}
			
 
				+        </button>
			
 
				+      </div>
			
 
				+
			
 
				+      {result && (
			
 
				+        <div style={{ border: '1px solid #ddd', padding: '20px', borderRadius: '4px' }}>
			
 
				+          <h3>解析结果:</h3>
			
 
				+          {result.success ? (
			
 
				+            <div>
			
 
				+              <p><strong>姓名:</strong> {result.data.name_zh} / {result.data.name_en}</p>
			
 
				+              <p><strong>职位:</strong> {result.data.title_zh} / {result.data.title_en}</p>
			
 
				+              <p><strong>公司:</strong> {result.data.hotel_zh} / {result.data.hotel_en}</p>
			
 
				+              <p><strong>手机:</strong> {result.data.mobile}</p>
			
 
				+              <p><strong>邮箱:</strong> {result.data.email}</p>
			
 
				+              <p><strong>照片:</strong> 
			
 
				+                {result.data.pic_url ? (
			
 
				+                  <img src={result.data.pic_url} alt="照片" style={{ maxWidth: '100px' }} />
			
 
				+                ) : (
			
 
				+                  <span>无</span>
			
 
				+                )}
			
 
				+              </p>
			
 
				+              <details>
			
 
				+                <summary>完整数据</summary>
			
 
				+                <pre>{JSON.stringify(result.data, null, 2)}</pre>
			
 
				+              </details>
			
 
				+            </div>
			
 
				+          ) : (
			
 
				+            <div style={{ backgroundColor: '#f8d7da', padding: '10px', borderRadius: '4px' }}>
			
 
				+              <p style={{ color: 'red' }}>解析失败: {result.message}</p>
			
 
				+            </div>
			
 
				+          )}
			
 
				+        </div>
			
 
				+      )}
			
 
				+    </div>
			
 
				+  );
			
 
				+}
			
 
				+
			
 
				+export default WebpageParser;
			
 
				+```
			
 
				+
			
 
				+## 测试用例
			
 
				+
			
 
				+### 测试用例 1: 基本人员信息解析
			
 
				+
			
 
				+**输入**:
			
 
				+```json
			
 
				+{
			
 
				+    "markdown_text": "# 李华\n\n**职位**: 酒店经理  \n**公司**: 万豪酒店  \n**手机**: 13900139000  \n**邮箱**: lihua@marriott.com  \n\n![照片](https://example.com/photos/lihua.jpg)"
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+**预期输出**:
			
 
				+```json
			
 
				+{
			
 
				+    "success": true,
			
 
				+    "message": "网页文本解析成功",
			
 
				+    "data": {
			
 
				+        "name_zh": "李华",
			
 
				+        "title_zh": "酒店经理",
			
 
				+        "hotel_zh": "万豪酒店",
			
 
				+        "mobile": "13900139000",
			
 
				+        "email": "lihua@marriott.com",
			
 
				+        "pic_url": "https://example.com/photos/lihua.jpg",
			
 
				+        ...
			
 
				+    }
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+### 测试用例 2: 多语言信息解析
			
 
				+
			
 
				+**输入**:
			
 
				+```json
			
 
				+{
			
 
				+    "markdown_text": "# 王芳 Wang Fang\n\n## Position / 职位\n- Chinese: 前台经理\n- English: Front Desk Manager\n\n## Company / 公司\n- 中文: 北京希尔顿酒店\n- English: Beijing Hilton Hotel\n\n**Contact / 联系方式**:\n- Mobile: 13700137000\n- Email: wangfang@hilton.com"
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+### 测试用例 3: 复杂格式文档
			
 
				+
			
 
				+**输入**:
			
 
				+```json
			
 
				+{
			
 
				+    "markdown_text": "# 员工档案\n\n## 基本信息\n\n| 字段 | 中文 | English |\n|------|------|----------|\n| 姓名 | 陈强 | Chen Qiang |\n| 职位 | 总经理 | General Manager |\n| 公司 | 上海外滩茂悦大酒店 | The Mayfair Shanghai |\n\n## 联系方式\n\n- 📱 手机: 13600136000\n- 📞 电话: 021-12345678\n- 📧 邮箱: chenqiang@mayfair.com\n- 📍 地址: 上海市黄浦区南京东路189号\n- 📮 邮编: 200001\n\n## 照片\n\n![个人照片](https://example.com/photos/chenqiang.jpg)"
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+### 测试用例 4: 错误情况测试
			
 
				+
			
 
				+#### 4.1 空文本测试
			
 
				+**输入**:
			
 
				+```json
			
 
				+{
			
 
				+    "markdown_text": ""
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+**预期输出**:
			
 
				+```json
			
 
				+{
			
 
				+    "success": false,
			
 
				+    "message": "markdown_text 内容不能为空",
			
 
				+    "data": null
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+#### 4.2 文本过短测试
			
 
				+**输入**:
			
 
				+```json
			
 
				+{
			
 
				+    "markdown_text": "短文本"
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+**预期输出**:
			
 
				+```json
			
 
				+{
			
 
				+    "success": false,
			
 
				+    "message": "markdown_text 内容过短，无法进行有效解析",
			
 
				+    "data": null
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+#### 4.3 缺少参数测试
			
 
				+**输入**:
			
 
				+```json
			
 
				+{
			
 
				+    "other_field": "值"
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+**预期输出**:
			
 
				+```json
			
 
				+{
			
 
				+    "success": false,
			
 
				+    "message": "缺少必填参数: markdown_text",
			
 
				+    "data": null
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+## cURL 测试命令
			
 
				+
			
 
				+### 基本测试
			
 
				+```bash
			
 
				+curl -X POST "http://localhost:5000/api/data_parse/webpage-parse" \
			
 
				+  -H "Content-Type: application/json" \
			
 
				+  -d '{
			
 
				+    "markdown_text": "# 张三\n\n职位：经理\n公司：测试酒店\n手机：13800138000\nemail：test@example.com\n\n![照片](https://example.com/photo.jpg)"
			
 
				+  }'
			
 
				+```
			
 
				+
			
 
				+### 复杂文档测试
			
 
				+```bash
			
 
				+curl -X POST "http://localhost:5000/api/data_parse/webpage-parse" \
			
 
				+  -H "Content-Type: application/json" \
			
 
				+  -d '{
			
 
				+    "markdown_text": "# 李明 - 高级酒店经理\n\n## 个人信息\n\n**职位**: 高级经理  \n**公司**: 香格里拉大酒店  \n**手机**: 13800138000  \n**固话**: 010-12345678  \n**邮箱**: liming@shangri-la.com  \n**地址**: 北京市朝阳区建国路1号  \n**邮编**: 100001  \n\n![个人照片](https://example.com/photos/liming.jpg)\n\n## 工作经历\n\n- 2020-至今: 香格里拉大酒店 高级经理\n- 2018-2020: 凯悦酒店 部门经理\n- 2015-2018: 万豪酒店 主管"
			
 
				+  }'
			
 
				+```
			
 
				+
			
 
				+### 错误测试
			
 
				+```bash
			
 
				+# 测试空文本
			
 
				+curl -X POST "http://localhost:5000/api/data_parse/webpage-parse" \
			
 
				+  -H "Content-Type: application/json" \
			
 
				+  -d '{
			
 
				+    "markdown_text": ""
			
 
				+  }'
			
 
				+
			
 
				+# 测试缺少参数
			
 
				+curl -X POST "http://localhost:5000/api/data_parse/webpage-parse" \
			
 
				+  -H "Content-Type: application/json" \
			
 
				+  -d '{
			
 
				+    "other_field": "值"
			
 
				+  }'
			
 
				+```
			
 
				+
			
 
				+## 注意事项
			
 
				+
			
 
				+1. **文本长度限制**: 输入文本去除首尾空白后至少10个字符，总长度最多50,000个字符（按字符数而非字节数计算；含中文的 UTF-8 文本实际字节数可能超过50KB）
			
 
				+2. **API 调用频率**: 建议控制调用频率，避免过于频繁的请求
			
 
				+3. **编码格式**: 确保文本使用 UTF-8 编码，支持中英文混合内容
			
 
				+4. **图片链接**: 支持提取 Markdown 图片语法 `![alt](url)` 和 HTML 图片标签 `<img src="url">`
			
 
				+5. **数据准确性**: 提取结果依赖于输入文本的质量和 AI 模型的理解能力
			
 
				+6. **隐私安全**: 提交的文本会发送至阿里云 Qwen 模型服务进行处理，请勿提交与人员信息提取无关的敏感内容（如密码、证件号码等）
			
 
				+7. **错误处理**: 建议在前端实现适当的错误处理和重试机制
			
 
				+
			
 
				+## 更新日志
			
 
				+
			
 
				+### v1.0 (2024-12-19)
			
 
				+- 初始版本发布
			
 
				+- 支持基本人员信息提取
			
 
				+- 支持照片链接提取
			
 
				+- 完整的参数验证和错误处理
			
 
				+- 支持中英文混合内容解析
			
 
				+
			
 
				+---
			
 
				+
			
 
				+**联系方式**: 如有问题或建议，请联系开发团队。