Prechádzať zdrojové kódy

将数据解析模块拆分为三个模块:
1. 系统数据解析模块(parse_system)
2. 任务数据解析模块(parse_task)
3. 网页数据解析模块(parse_web)
同步更新对应的 API 接口及相关导入。

maxiaolong pred 1 dňom
rodič
commit
444f95bd81

+ 5 - 2
app/api/data_parse/routes.py

@@ -1,6 +1,6 @@
 from flask import jsonify, request, make_response, Blueprint, current_app, send_file
 from app.api.data_parse import bp
-from app.core.data_parse.parse import (
+from app.core.data_parse.parse_system import (
     update_business_card, 
     get_business_cards, 
     update_business_card_status, 
@@ -16,7 +16,10 @@ from app.core.data_parse.parse import (
     get_duplicate_records, 
     process_duplicate_record, 
     get_duplicate_record_detail, 
-    fix_broken_duplicate_records, 
+    fix_broken_duplicate_records
+)
+# 导入解析任务相关函数
+from app.core.data_parse.parse_task import (
     get_parse_tasks, 
     get_parse_task_detail
 )

+ 3 - 3
app/core/data_parse/parse_card.py

@@ -9,7 +9,7 @@ import uuid
 from app.config.config import DevelopmentConfig, ProductionConfig
 
 # 导入原有的函数和模型
-from app.core.data_parse.parse import (
+from app.core.data_parse.parse_system import (
     BusinessCard, DuplicateBusinessCard,
     parse_text_with_qwen25VLplus, check_duplicate_business_card,
     update_career_path, create_main_card_with_duplicates
@@ -196,7 +196,7 @@ def add_business_card(card_data, image_file=None):
                 existing_card = duplicate_check['existing_card']
                 
                 # 导入手机号码处理函数
-                from app.core.data_parse.parse import normalize_mobile_numbers, merge_mobile_numbers
+                from app.core.data_parse.parse_system import normalize_mobile_numbers, merge_mobile_numbers
                 
                 # 更新基本信息
                 existing_card.name_en = card_data.get('name_en', existing_card.name_en)
@@ -307,7 +307,7 @@ def add_business_card(card_data, image_file=None):
                 initial_career_path = [initial_entry]
                 
                 # 导入手机号码处理函数
-                from app.core.data_parse.parse import normalize_mobile_numbers
+                from app.core.data_parse.parse_system import normalize_mobile_numbers
                 
                 # 处理年龄字段,确保是有效的整数或None
                 age_value = None

+ 0 - 111
app/core/data_parse/parse.py → app/core/data_parse/parse_system.py

@@ -1057,117 +1057,6 @@ def fix_broken_duplicate_records():
         }
 
 
-def get_parse_tasks(page=1, per_page=10, task_type=None, task_status=None):
-    """
-    获取解析任务列表
-    
-    Args:
-        page (int): 页码
-        per_page (int): 每页记录数
-        task_type (str): 任务类型过滤
-        task_status (str): 任务状态过滤
-        
-    Returns:
-        dict: 包含查询结果和分页信息
-    """
-    try:
-        if page < 1 or per_page < 1 or per_page > 100:
-            return {
-                'code': 400,
-                'success': False,
-                'message': '分页参数错误',
-                'data': None
-            }
-        
-        query = ParseTaskRepository.query
-        
-        if task_type:
-            query = query.filter_by(task_type=task_type)
-        if task_status:
-            query = query.filter_by(task_status=task_status)
-        
-        query = query.order_by(ParseTaskRepository.created_at.desc())
-        
-        pagination = query.paginate(page=page, per_page=per_page, error_out=False)
-        
-        tasks = [task.to_dict() for task in pagination.items]
-        
-        return {
-            'code': 200,
-            'success': True,
-            'message': '获取解析任务列表成功',
-            'data': {
-                'tasks': tasks,
-                'pagination': {
-                    'page': page,
-                    'per_page': per_page,
-                    'total': pagination.total,
-                    'pages': pagination.pages,
-                    'has_next': pagination.has_next,
-                    'has_prev': pagination.has_prev
-                }
-            }
-        }
-    
-    except Exception as e:
-        error_msg = f"获取解析任务列表失败: {str(e)}"
-        logging.error(error_msg, exc_info=True)
-        
-        return {
-            'code': 500,
-            'success': False,
-            'message': error_msg,
-            'data': None
-        }
-
-
-def get_parse_task_detail(task_name):
-    """
-    获取解析任务详情
-    
-    Args:
-        task_name (str): 任务名称
-        
-    Returns:
-        dict: 包含查询结果
-    """
-    try:
-        if not task_name:
-            return {
-                'code': 400,
-                'success': False,
-                'message': '任务名称不能为空',
-                'data': None
-            }
-        
-        task = ParseTaskRepository.query.filter_by(task_name=task_name).first()
-        
-        if not task:
-            return {
-                'code': 404,
-                'success': False,
-                'message': f'未找到任务名称为 {task_name} 的记录',
-                'data': None
-            }
-        
-        return {
-            'code': 200,
-            'success': True,
-            'message': f'成功获取任务 {task_name} 的详细信息',
-            'data': task.to_dict()
-        }
-    
-    except Exception as e:
-        error_msg = f"获取解析任务详情失败: {str(e)}"
-        logging.error(error_msg, exc_info=True)
-        
-        return {
-            'code': 500,
-            'success': False,
-            'message': error_msg,
-            'data': None
-        }
-
 
 def create_talent_tag(tag_data):
     """

+ 116 - 0
app/core/data_parse/parse_task.py

@@ -0,0 +1,116 @@
+from app import db
+from datetime import datetime
+import logging
+from .parse_system import ParseTaskRepository
+
+
+def get_parse_tasks(page=1, per_page=10, task_type=None, task_status=None):
+    """
+    获取解析任务列表
+    
+    Args:
+        page (int): 页码
+        per_page (int): 每页记录数
+        task_type (str): 任务类型过滤
+        task_status (str): 任务状态过滤
+        
+    Returns:
+        dict: 包含查询结果和分页信息
+    """
+    try:
+        if page < 1 or per_page < 1 or per_page > 100:
+            return {
+                'code': 400,
+                'success': False,
+                'message': '分页参数错误',
+                'data': None
+            }
+        
+        query = ParseTaskRepository.query
+        
+        if task_type:
+            query = query.filter_by(task_type=task_type)
+        if task_status:
+            query = query.filter_by(task_status=task_status)
+        
+        query = query.order_by(ParseTaskRepository.created_at.desc())
+        
+        pagination = query.paginate(page=page, per_page=per_page, error_out=False)
+        
+        tasks = [task.to_dict() for task in pagination.items]
+        
+        return {
+            'code': 200,
+            'success': True,
+            'message': '获取解析任务列表成功',
+            'data': {
+                'tasks': tasks,
+                'pagination': {
+                    'page': page,
+                    'per_page': per_page,
+                    'total': pagination.total,
+                    'pages': pagination.pages,
+                    'has_next': pagination.has_next,
+                    'has_prev': pagination.has_prev
+                }
+            }
+        }
+    
+    except Exception as e:
+        error_msg = f"获取解析任务列表失败: {str(e)}"
+        logging.error(error_msg, exc_info=True)
+        
+        return {
+            'code': 500,
+            'success': False,
+            'message': error_msg,
+            'data': None
+        }
+
+
+def get_parse_task_detail(task_name):
+    """
+    获取解析任务详情
+    
+    Args:
+        task_name (str): 任务名称
+        
+    Returns:
+        dict: 包含查询结果
+    """
+    try:
+        if not task_name:
+            return {
+                'code': 400,
+                'success': False,
+                'message': '任务名称不能为空',
+                'data': None
+            }
+        
+        task = ParseTaskRepository.query.filter_by(task_name=task_name).first()
+        
+        if not task:
+            return {
+                'code': 404,
+                'success': False,
+                'message': f'未找到任务名称为 {task_name} 的记录',
+                'data': None
+            }
+        
+        return {
+            'code': 200,
+            'success': True,
+            'message': f'成功获取任务 {task_name} 的详细信息',
+            'data': task.to_dict()
+        }
+    
+    except Exception as e:
+        error_msg = f"获取解析任务详情失败: {str(e)}"
+        logging.error(error_msg, exc_info=True)
+        
+        return {
+            'code': 500,
+            'success': False,
+            'message': error_msg,
+            'data': None
+        } 

+ 2 - 2
app/core/data_parse/parse_web.py

@@ -12,7 +12,7 @@ from openai import OpenAI
 
 # 导入配置和业务逻辑模块
 from app.config.config import DevelopmentConfig, ProductionConfig
-from app.core.data_parse.parse import (
+from app.core.data_parse.parse_system import (
     BusinessCard, check_duplicate_business_card, 
     create_main_card_with_duplicates, update_career_path,
     normalize_mobile_numbers, ParseTaskRepository
@@ -406,7 +406,7 @@ def process_single_talent_card(talent_data, minio_md_path):
                 new_mobile = normalize_mobile_numbers(talent_data.get('mobile', ''))
                 if new_mobile:
                     # 合并手机号码
-                    from app.core.data_parse.parse import merge_mobile_numbers
+                    from app.core.data_parse.parse_system import merge_mobile_numbers
                     existing_card.mobile = merge_mobile_numbers(existing_card.mobile, new_mobile)
                 elif talent_data.get('mobile') == '':
                     existing_card.mobile = ''

+ 1 - 1
fix_duplicate_records.py

@@ -24,7 +24,7 @@ sys.path.insert(0, project_root)
 
 try:
     from app import create_app, db
-    from app.core.data_parse.parse import DuplicateBusinessCard
+    from app.core.data_parse.parse_system import DuplicateBusinessCard
     import logging
     from datetime import datetime