# dataflows.py
  1. import logging
  2. from typing import Dict, List, Optional, Any, Union
  3. from datetime import datetime
  4. import json
  5. from app.core.llm.llm_service import llm_client, llm_sql
  6. from app.core.graph.graph_operations import connect_graph, create_or_get_node, get_node, relationship_exists
  7. from app.core.meta_data import translate_and_parse, get_formatted_time
  8. from py2neo import Relationship
  9. from app import db
  10. from sqlalchemy import text
  11. logger = logging.getLogger(__name__)
  12. class DataFlowService:
  13. """数据流服务类,处理数据流相关的业务逻辑"""
  14. @staticmethod
  15. def get_dataflows(page: int = 1, page_size: int = 10, search: str = '') -> Dict[str, Any]:
  16. """
  17. 获取数据流列表
  18. Args:
  19. page: 页码
  20. page_size: 每页大小
  21. search: 搜索关键词
  22. Returns:
  23. 包含数据流列表和分页信息的字典
  24. """
  25. try:
  26. # 从图数据库查询数据流列表
  27. skip_count = (page - 1) * page_size
  28. # 构建搜索条件
  29. where_clause = ""
  30. params = {'skip': skip_count, 'limit': page_size}
  31. if search:
  32. where_clause = "WHERE n.name_zh CONTAINS $search OR n.description CONTAINS $search"
  33. params['search'] = search
  34. # 查询数据流列表
  35. query = f"""
  36. MATCH (n:DataFlow)
  37. {where_clause}
  38. RETURN n, id(n) as node_id
  39. ORDER BY n.created_at DESC
  40. SKIP $skip
  41. LIMIT $limit
  42. """
  43. # 获取Neo4j驱动(如果连接失败会抛出ConnectionError异常)
  44. try:
  45. with connect_graph().session() as session:
  46. list_result = session.run(query, **params).data()
  47. # 查询总数
  48. count_query = f"""
  49. MATCH (n:DataFlow)
  50. {where_clause}
  51. RETURN count(n) as total
  52. """
  53. count_params = {'search': search} if search else {}
  54. count_result = session.run(count_query, **count_params).single()
  55. total = count_result['total'] if count_result else 0
  56. except Exception as e:
  57. # 确保 driver 被正确关闭,避免资源泄漏 - 这里不再需要手动关闭driver,因为connect_graph返回的可能是单例或新实例,
  58. # 但如果是新实例,我们没有引用它去关闭。如果connect_graph设计为每次返回新实例且需要关闭,
  59. # 那么之前的代码是对的。如果connect_graph返回单例,则不应关闭。
  60. # 根据用户反馈:The driver.close() call prematurely closes a shared driver instance.
  61. # 所以我们直接使用 session,并不关闭 driver。
  62. logger.error(f"查询数据流失败: {str(e)}")
  63. raise e
  64. # 格式化结果
  65. dataflows = []
  66. for record in list_result:
  67. node = record['n']
  68. dataflow = dict(node)
  69. dataflow['id'] = record['node_id'] # 使用查询返回的node_id
  70. dataflows.append(dataflow)
  71. return {
  72. 'list': dataflows,
  73. 'pagination': {
  74. 'page': page,
  75. 'page_size': page_size,
  76. 'total': total,
  77. 'total_pages': (total + page_size - 1) // page_size
  78. }
  79. }
  80. except Exception as e:
  81. logger.error(f"获取数据流列表失败: {str(e)}")
  82. raise e
  83. @staticmethod
  84. def get_dataflow_by_id(dataflow_id: int) -> Optional[Dict[str, Any]]:
  85. """
  86. 根据ID获取数据流详情
  87. Args:
  88. dataflow_id: 数据流ID
  89. Returns:
  90. 数据流详情字典,如果不存在则返回None
  91. """
  92. try:
  93. # 从Neo4j获取基本信息
  94. neo4j_query = """
  95. MATCH (n:DataFlow)
  96. WHERE id(n) = $dataflow_id
  97. OPTIONAL MATCH (n)-[:LABEL]-(la:DataLabel)
  98. RETURN n, id(n) as node_id,
  99. collect(DISTINCT {id: id(la), name: la.name}) as tags
  100. """
  101. with connect_graph().session() as session:
  102. neo4j_result = session.run(neo4j_query, dataflow_id=dataflow_id).data()
  103. if not neo4j_result:
  104. return None
  105. record = neo4j_result[0]
  106. node = record['n']
  107. dataflow = dict(node)
  108. dataflow['id'] = record['node_id']
  109. dataflow['tags'] = record['tags']
  110. # 从PostgreSQL获取额外信息
  111. pg_query = """
  112. SELECT
  113. source_table,
  114. target_table,
  115. script_name,
  116. script_type,
  117. script_requirement,
  118. script_content,
  119. user_name,
  120. create_time,
  121. update_time,
  122. target_dt_column
  123. FROM dags.data_transform_scripts
  124. WHERE script_name = :script_name
  125. """
  126. with db.engine.connect() as conn:
  127. pg_result = conn.execute(text(pg_query), {"script_name": dataflow.get('name_zh')}).fetchone()
  128. if pg_result:
  129. # 将PostgreSQL数据添加到结果中
  130. dataflow.update({
  131. 'source_table': pg_result.source_table,
  132. 'target_table': pg_result.target_table,
  133. 'script_type': pg_result.script_type,
  134. 'script_requirement': pg_result.script_requirement,
  135. 'script_content': pg_result.script_content,
  136. 'created_by': pg_result.user_name,
  137. 'pg_created_at': pg_result.create_time,
  138. 'pg_updated_at': pg_result.update_time,
  139. 'target_dt_column': pg_result.target_dt_column
  140. })
  141. return dataflow
  142. except Exception as e:
  143. logger.error(f"获取数据流详情失败: {str(e)}")
  144. raise e
    @staticmethod
    def create_dataflow(data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Create a new dataflow.

        Writes a DataFlow node to Neo4j, mirrors the script info into
        PostgreSQL, then creates script relationships in the graph.  The
        PG write and relationship creation are best-effort: their failures
        are logged but do not roll back the already-created graph node.

        Args:
            data: dataflow configuration; must contain 'name_zh' and 'describe'.

        Returns:
            Dict describing the created dataflow (node properties + 'id').

        Raises:
            ValueError: on missing required fields or duplicate name.
            Exception: re-raised after logging on any other failure.
        """
        try:
            # Validate required fields up front.
            required_fields = ['name_zh', 'describe']
            for field in required_fields:
                if field not in data:
                    raise ValueError(f"缺少必填字段: {field}")
            dataflow_name = data['name_zh']
            # Derive an English name via LLM translation; fall back to a
            # snake_cased version of the original name if translation fails.
            try:
                result_list = translate_and_parse(dataflow_name)
                name_en = result_list[0] if result_list else dataflow_name.lower().replace(' ', '_')
            except Exception as e:
                logger.warning(f"翻译失败,使用默认英文名: {str(e)}")
                name_en = dataflow_name.lower().replace(' ', '_')
            # Node payload with audit timestamps.
            node_data = {
                'name_zh': dataflow_name,
                'name_en': name_en,
                'category': data.get('category', ''),
                'organization': data.get('organization', ''),
                'leader': data.get('leader', ''),
                'frequency': data.get('frequency', ''),
                'tag': data.get('tag', ''),
                'describe': data.get('describe', ''),
                'status': data.get('status', 'inactive'),
                'update_mode': data.get('update_mode', 'append'),
                'created_at': get_formatted_time(),
                'updated_at': get_formatted_time()
            }
            # Reject duplicates before creating the node.
            dataflow_id = get_node('DataFlow', name=dataflow_name)
            if dataflow_id:
                raise ValueError(f"数据流 '{dataflow_name}' 已存在")
            dataflow_id = create_or_get_node('DataFlow', **node_data)
            # Tag linking is best-effort: failures are logged, not raised.
            tag_id = data.get('tag')
            if tag_id is not None:
                try:
                    DataFlowService._handle_tag_relationship(dataflow_id, tag_id)
                except Exception as e:
                    logger.warning(f"处理标签关系时出错: {str(e)}")
            # After the graph node exists, mirror the script into PostgreSQL;
            # only on PG success are script relationships created in Neo4j.
            try:
                DataFlowService._save_to_pg_database(data, dataflow_name, name_en)
                logger.info(f"数据流信息已写入PG数据库: {dataflow_name}")
                try:
                    DataFlowService._handle_script_relationships(data,dataflow_name,name_en)
                    logger.info(f"脚本关系创建成功: {dataflow_name}")
                except Exception as script_error:
                    logger.warning(f"创建脚本关系失败: {str(script_error)}")
            except Exception as pg_error:
                # Deliberately NOT rolling back the graph node here; a
                # distributed transaction would be needed for atomicity.
                logger.error(f"写入PG数据库失败: {str(pg_error)}")
            # Re-query the node to return its full, persisted state.
            query = "MATCH (n:DataFlow {name_zh: $name_zh}) RETURN n, id(n) as node_id"
            with connect_graph().session() as session:
                id_result = session.run(query, name_zh=dataflow_name).single()
                if id_result:
                    dataflow_node = id_result['n']
                    node_id = id_result['node_id']
                    # Convert node properties to a plain dict and attach id.
                    result = dict(dataflow_node)
                    result['id'] = node_id
                else:
                    # Fallback when the re-query fails: return minimal info.
                    result = {
                        'id': dataflow_id if isinstance(dataflow_id, int) else None,
                        'name_zh': dataflow_name,
                        'name_en': name_en,
                        'created_at': get_formatted_time()
                    }
            logger.info(f"创建数据流成功: {dataflow_name}")
            return result
        except Exception as e:
            logger.error(f"创建数据流失败: {str(e)}")
            raise e
    @staticmethod
    def _save_to_pg_database(data: Dict[str, Any], script_name: str, name_en: str):
        """
        Persist script information to PostgreSQL.

        Upserts a row into dags.data_transform_scripts (keyed on
        target_table + script_name), then — best-effort, inside a nested
        transaction — inserts a work item into public.task_list with a
        Markdown task description built from the script requirement.

        Args:
            data: incoming payload with script fields.
            script_name: script (and dataflow) name used as the key.
            name_en: English name; used as target_table fallback.

        Raises:
            ValueError: when script_name is empty.
            Exception: re-raised after rollback on DB failure.
        """
        try:
            # Extract script-related fields; table names may arrive as
            # "label:name" — keep only the part after the colon.
            script_requirement = data.get('script_requirement', '')
            script_content = data.get('script_content', '')
            source_table = data.get('source_table', '').split(':')[-1] if ':' in data.get('source_table', '') else data.get('source_table', '')
            target_table = data.get('target_table', '').split(':')[-1] if ':' in data.get('target_table', '') else data.get('target_table', name_en)  # default to the English name
            script_type = data.get('script_type', 'python')
            user_name = data.get('created_by', 'system')
            target_dt_column = data.get('target_dt_column', '')
            # Validate required fields.
            if not target_table:
                target_table = name_en
            if not script_name:
                raise ValueError("script_name不能为空")
            # Upsert keyed on (target_table, script_name); create_time is only
            # set on first insert, update_time refreshes on conflict.
            insert_sql = text("""
                INSERT INTO dags.data_transform_scripts
                (source_table, target_table, script_name, script_type, script_requirement,
                 script_content, user_name, create_time, update_time, target_dt_column)
                VALUES
                (:source_table, :target_table, :script_name, :script_type, :script_requirement,
                 :script_content, :user_name, :create_time, :update_time, :target_dt_column)
                ON CONFLICT (target_table, script_name)
                DO UPDATE SET
                    source_table = EXCLUDED.source_table,
                    script_type = EXCLUDED.script_type,
                    script_requirement = EXCLUDED.script_requirement,
                    script_content = EXCLUDED.script_content,
                    user_name = EXCLUDED.user_name,
                    update_time = EXCLUDED.update_time,
                    target_dt_column = EXCLUDED.target_dt_column
            """)
            current_time = datetime.now()
            params = {
                'source_table': source_table,
                'target_table': target_table,
                'script_name': script_name,
                'script_type': script_type,
                'script_requirement': script_requirement,
                'script_content': script_content,
                'user_name': user_name,
                'create_time': current_time,
                'update_time': current_time,
                'target_dt_column': target_dt_column
            }
            db.session.execute(insert_sql, params)
            # Additionally record a task in task_list (best-effort).
            try:
                # Build a detailed Markdown task description from the
                # requirement; fall back to the raw text on any failure.
                task_description_md = script_requirement
                try:
                    import json
                    try:
                        req_json = json.loads(script_requirement)
                    except (json.JSONDecodeError, TypeError):
                        req_json = None
                    if isinstance(req_json, dict):
                        # Extract the structured fields of the requirement.
                        business_domains = []
                        bd_str = req_json.get('business_domain', '')
                        if bd_str:
                            business_domains = [d.strip() for d in bd_str.split(',') if d.strip()]
                        data_source = req_json.get('data_source', '')
                        request_content_str = req_json.get('request_content', '')
                        # Generate a DDL snippet per referenced BusinessDomain.
                        domain_ddls = []
                        if business_domains:
                            try:
                                with connect_graph().session() as session:
                                    for domain in business_domains:
                                        # Match the domain by any of its name
                                        # properties, plus its DataMeta columns.
                                        cypher = """
                                        MATCH (n:BusinessDomain)
                                        WHERE n.name = $name OR n.name_zh = $name OR n.name_en = $name
                                        OPTIONAL MATCH (n)-[:INCLUDES]->(m:DataMeta)
                                        RETURN n, collect(m) as metadata
                                        """
                                        result = session.run(cypher, name=domain).single()
                                        if result:
                                            node = result['n']
                                            metadata = result['metadata']
                                            node_props = dict(node)
                                            # Prefer the English name as the table name.
                                            table_name = node_props.get('name_en', domain)
                                            ddl_lines = []
                                            ddl_lines.append(f"CREATE TABLE {table_name} (")
                                            if metadata:
                                                column_definitions = []
                                                for meta in metadata:
                                                    if meta:
                                                        meta_props = dict(meta)
                                                        column_name = meta_props.get('name_en', meta_props.get('name_zh', 'unknown_column'))
                                                        data_type = meta_props.get('data_type', 'VARCHAR(255)')
                                                        comment = meta_props.get('name_zh', '')
                                                        column_def = f" {column_name} {data_type}"
                                                        if comment:
                                                            column_def += f" COMMENT '{comment}'"
                                                        column_definitions.append(column_def)
                                                if column_definitions:
                                                    ddl_lines.append(",\n".join(column_definitions))
                                                else:
                                                    ddl_lines.append(" id BIGINT PRIMARY KEY COMMENT '主键ID'")
                                            else:
                                                ddl_lines.append(" id BIGINT PRIMARY KEY COMMENT '主键ID'")
                                            ddl_lines.append(");")
                                            table_comment = node_props.get('name_zh', node_props.get('describe', table_name))
                                            if table_comment and table_comment != table_name:
                                                ddl_lines.append(f"COMMENT ON TABLE {table_name} IS '{table_comment}';")
                                            domain_ddls.append("\n".join(ddl_lines))
                            except Exception as neo_e:
                                logger.error(f"获取BusinessDomain DDL失败: {str(neo_e)}")
                        # Assemble the Markdown document.
                        task_desc_parts = [f"# Task: {script_name}\n"]
                        if data_source:
                            task_desc_parts.append(f"## Data Source\n{data_source}\n")
                        if domain_ddls:
                            task_desc_parts.append("## Business Domain Structures (DDL)")
                            for ddl in domain_ddls:
                                task_desc_parts.append(f"```sql\n{ddl}\n```\n")
                        task_desc_parts.append(f"## Request Content\n{request_content_str}\n")
                        task_desc_parts.append("## Implementation Steps")
                        task_desc_parts.append("1. Generate a Python program to implement the logic.")
                        task_desc_parts.append("2. Generate an n8n workflow to schedule and execute the Python program.")
                        task_description_md = "\n".join(task_desc_parts)
                except Exception as parse_e:
                    logger.warning(f"解析任务描述详情失败,使用原始描述: {str(parse_e)}")
                    task_description_md = script_requirement
                # Assumes the process runs from the project root, with this
                # module living under app/core/data_flow/.
                code_path = 'app/core/data_flow'
                task_insert_sql = text("""
                    INSERT INTO public.task_list
                    (task_name, task_description, status, code_name, code_path, create_by, create_time, update_time)
                    VALUES
                    (:task_name, :task_description, :status, :code_name, :code_path, :create_by, :create_time, :update_time)
                """)
                task_params = {
                    'task_name': script_name,
                    'task_description': task_description_md,
                    'status': 'pending',
                    'code_name': script_name,
                    'code_path': code_path,
                    'create_by': 'cursor',
                    'create_time': current_time,
                    'update_time': current_time
                }
                # Nested transaction so a task_list failure cannot poison the
                # outer transaction holding the scripts upsert.
                with db.session.begin_nested():
                    db.session.execute(task_insert_sql, task_params)
                logger.info(f"成功将任务信息写入task_list表: task_name={script_name}")
            except Exception as task_error:
                # Log but do not interrupt the main flow.
                logger.error(f"写入task_list表失败: {str(task_error)}")
                # If task_list writes must succeed, re-raise here instead:
                # raise task_error
            db.session.commit()
            logger.info(f"成功将脚本信息写入PG数据库: target_table={target_table}, script_name={script_name}")
        except Exception as e:
            db.session.rollback()
            logger.error(f"写入PG数据库失败: {str(e)}")
            raise e
  407. @staticmethod
  408. def _handle_children_relationships(dataflow_node, children_ids):
  409. """处理子节点关系"""
  410. logger.debug(f"处理子节点关系,原始children_ids: {children_ids}, 类型: {type(children_ids)}")
  411. # 确保children_ids是列表格式
  412. if not isinstance(children_ids, (list, tuple)):
  413. if children_ids is not None:
  414. children_ids = [children_ids] # 如果是单个值,转换为列表
  415. logger.debug(f"将单个值转换为列表: {children_ids}")
  416. else:
  417. children_ids = [] # 如果是None,转换为空列表
  418. logger.debug("将None转换为空列表")
  419. for child_id in children_ids:
  420. try:
  421. # 查找子节点
  422. query = "MATCH (n) WHERE id(n) = $child_id RETURN n"
  423. with connect_graph().session() as session:
  424. result = session.run(query, child_id=child_id).data()
  425. if result:
  426. child_node = result[0]['n']
  427. # 获取dataflow_node的ID
  428. dataflow_id = getattr(dataflow_node, 'identity', None)
  429. if dataflow_id is None:
  430. # 如果没有identity属性,从名称查询ID
  431. query_id = "MATCH (n:DataFlow) WHERE n.name_zh = $name_zh RETURN id(n) as node_id"
  432. id_result = session.run(query_id, name_zh=dataflow_node.get('name_zh')).single()
  433. dataflow_id = id_result['node_id'] if id_result else None
  434. # 创建关系 - 使用ID调用relationship_exists
  435. if dataflow_id and not relationship_exists(dataflow_id, 'child', child_id):
  436. session.run("MATCH (a), (b) WHERE id(a) = $dataflow_id AND id(b) = $child_id CREATE (a)-[:child]->(b)",
  437. dataflow_id=dataflow_id, child_id=child_id)
  438. logger.info(f"创建子节点关系: {dataflow_id} -> {child_id}")
  439. except Exception as e:
  440. logger.warning(f"创建子节点关系失败 {child_id}: {str(e)}")
  441. @staticmethod
  442. def _handle_tag_relationship(dataflow_id, tag_id):
  443. """处理标签关系"""
  444. try:
  445. # 查找标签节点
  446. query = "MATCH (n:DataLabel) WHERE id(n) = $tag_id RETURN n"
  447. with connect_graph().session() as session:
  448. result = session.run(query, tag_id=tag_id).data()
  449. if result:
  450. tag_node = result[0]['n']
  451. # 创建关系 - 使用ID调用relationship_exists
  452. if dataflow_id and not relationship_exists(dataflow_id, 'LABEL', tag_id):
  453. session.run("MATCH (a), (b) WHERE id(a) = $dataflow_id AND id(b) = $tag_id CREATE (a)-[:LABEL]->(b)",
  454. dataflow_id=dataflow_id, tag_id=tag_id)
  455. logger.info(f"创建标签关系: {dataflow_id} -> {tag_id}")
  456. except Exception as e:
  457. logger.warning(f"创建标签关系失败 {tag_id}: {str(e)}")
  458. @staticmethod
  459. def update_dataflow(dataflow_id: int, data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
  460. """
  461. 更新数据流
  462. Args:
  463. dataflow_id: 数据流ID
  464. data: 更新的数据
  465. Returns:
  466. 更新后的数据流信息,如果不存在则返回None
  467. """
  468. try:
  469. # 查找节点
  470. query = "MATCH (n:DataFlow) WHERE id(n) = $dataflow_id RETURN n"
  471. with connect_graph().session() as session:
  472. result = session.run(query, dataflow_id=dataflow_id).data()
  473. if not result:
  474. return None
  475. # 更新节点属性
  476. update_fields = []
  477. params = {'dataflow_id': dataflow_id}
  478. for key, value in data.items():
  479. if key not in ['id', 'created_at']: # 保护字段
  480. if key == 'config' and isinstance(value, dict):
  481. value = json.dumps(value, ensure_ascii=False)
  482. update_fields.append(f"n.{key} = ${key}")
  483. params[key] = value
  484. if update_fields:
  485. params['updated_at'] = get_formatted_time()
  486. update_fields.append("n.updated_at = $updated_at")
  487. update_query = f"""
  488. MATCH (n:DataFlow) WHERE id(n) = $dataflow_id
  489. SET {', '.join(update_fields)}
  490. RETURN n, id(n) as node_id
  491. """
  492. result = session.run(update_query, **params).data()
  493. if result:
  494. node = result[0]['n']
  495. updated_dataflow = dict(node)
  496. updated_dataflow['id'] = result[0]['node_id'] # 使用查询返回的node_id
  497. logger.info(f"更新数据流成功: ID={dataflow_id}")
  498. return updated_dataflow
  499. return None
  500. except Exception as e:
  501. logger.error(f"更新数据流失败: {str(e)}")
  502. raise e
  503. @staticmethod
  504. def delete_dataflow(dataflow_id: int) -> bool:
  505. """
  506. 删除数据流
  507. Args:
  508. dataflow_id: 数据流ID
  509. Returns:
  510. 删除是否成功
  511. """
  512. try:
  513. # 删除节点及其关系
  514. query = """
  515. MATCH (n:DataFlow) WHERE id(n) = $dataflow_id
  516. DETACH DELETE n
  517. RETURN count(n) as deleted_count
  518. """
  519. with connect_graph().session() as session:
  520. delete_result = session.run(query, dataflow_id=dataflow_id).single()
  521. result = delete_result['deleted_count'] if delete_result else 0
  522. if result and result > 0:
  523. logger.info(f"删除数据流成功: ID={dataflow_id}")
  524. return True
  525. return False
  526. except Exception as e:
  527. logger.error(f"删除数据流失败: {str(e)}")
  528. raise e
  529. @staticmethod
  530. def execute_dataflow(dataflow_id: int, params: Dict[str, Any] = None) -> Dict[str, Any]:
  531. """
  532. 执行数据流
  533. Args:
  534. dataflow_id: 数据流ID
  535. params: 执行参数
  536. Returns:
  537. 执行结果信息
  538. """
  539. try:
  540. # 检查数据流是否存在
  541. query = "MATCH (n:DataFlow) WHERE id(n) = $dataflow_id RETURN n"
  542. with connect_graph().session() as session:
  543. result = session.run(query, dataflow_id=dataflow_id).data()
  544. if not result:
  545. raise ValueError(f"数据流不存在: ID={dataflow_id}")
  546. execution_id = f"exec_{dataflow_id}_{int(datetime.now().timestamp())}"
  547. # TODO: 这里应该实际执行数据流
  548. # 目前返回模拟结果
  549. result = {
  550. 'execution_id': execution_id,
  551. 'dataflow_id': dataflow_id,
  552. 'status': 'running',
  553. 'started_at': datetime.now().isoformat(),
  554. 'params': params or {},
  555. 'progress': 0
  556. }
  557. logger.info(f"开始执行数据流: ID={dataflow_id}, execution_id={execution_id}")
  558. return result
  559. except Exception as e:
  560. logger.error(f"执行数据流失败: {str(e)}")
  561. raise e
  562. @staticmethod
  563. def get_dataflow_status(dataflow_id: int) -> Dict[str, Any]:
  564. """
  565. 获取数据流执行状态
  566. Args:
  567. dataflow_id: 数据流ID
  568. Returns:
  569. 执行状态信息
  570. """
  571. try:
  572. # TODO: 这里应该查询实际的执行状态
  573. # 目前返回模拟状态
  574. query = "MATCH (n:DataFlow) WHERE id(n) = $dataflow_id RETURN n"
  575. with connect_graph().session() as session:
  576. result = session.run(query, dataflow_id=dataflow_id).data()
  577. if not result:
  578. raise ValueError(f"数据流不存在: ID={dataflow_id}")
  579. status = ['running', 'completed', 'failed', 'pending'][dataflow_id % 4]
  580. return {
  581. 'dataflow_id': dataflow_id,
  582. 'status': status,
  583. 'progress': 100 if status == 'completed' else (dataflow_id * 10) % 100,
  584. 'started_at': datetime.now().isoformat(),
  585. 'completed_at': datetime.now().isoformat() if status == 'completed' else None,
  586. 'error_message': '执行过程中发生错误' if status == 'failed' else None
  587. }
  588. except Exception as e:
  589. logger.error(f"获取数据流状态失败: {str(e)}")
  590. raise e
  591. @staticmethod
  592. def get_dataflow_logs(dataflow_id: int, page: int = 1, page_size: int = 50) -> Dict[str, Any]:
  593. """
  594. 获取数据流执行日志
  595. Args:
  596. dataflow_id: 数据流ID
  597. page: 页码
  598. page_size: 每页大小
  599. Returns:
  600. 执行日志列表和分页信息
  601. """
  602. try:
  603. # TODO: 这里应该查询实际的执行日志
  604. # 目前返回模拟日志
  605. query = "MATCH (n:DataFlow) WHERE id(n) = $dataflow_id RETURN n"
  606. with connect_graph().session() as session:
  607. result = session.run(query, dataflow_id=dataflow_id).data()
  608. if not result:
  609. raise ValueError(f"数据流不存在: ID={dataflow_id}")
  610. mock_logs = [
  611. {
  612. 'id': i,
  613. 'timestamp': datetime.now().isoformat(),
  614. 'level': ['INFO', 'WARNING', 'ERROR'][i % 3],
  615. 'message': f'数据流执行日志消息 {i}',
  616. 'component': ['source', 'transform', 'target'][i % 3]
  617. }
  618. for i in range(1, 101)
  619. ]
  620. # 分页处理
  621. total = len(mock_logs)
  622. start = (page - 1) * page_size
  623. end = start + page_size
  624. logs = mock_logs[start:end]
  625. return {
  626. 'logs': logs,
  627. 'pagination': {
  628. 'page': page,
  629. 'page_size': page_size,
  630. 'total': total,
  631. 'total_pages': (total + page_size - 1) // page_size
  632. }
  633. }
  634. except Exception as e:
  635. logger.error(f"获取数据流日志失败: {str(e)}")
  636. raise e
    @staticmethod
    def create_script(request_data: Union[Dict[str, Any], str]) -> str:
        """
        Generate a PostgreSQL script via the Deepseek LLM.

        Builds a structured prompt from the source/target table DDLs (looked
        up in Neo4j) and the request content, then calls llm_sql().

        Args:
            request_data: dict (or JSON string) carrying 'input', 'output',
                and the request content.

        Returns:
            The generated SQL script text.

        Raises:
            ValueError: on unparseable/missing parameters or empty LLM output.
            Exception: re-raised after logging on any other failure.
        """
        try:
            logger.info(f"开始处理脚本生成请求: {request_data}")
            logger.info(f"request_data类型: {type(request_data)}")
            # Accept a JSON string as well as a dict.
            if isinstance(request_data, str):
                logger.warning(f"request_data是字符串,尝试解析为JSON: {request_data}")
                try:
                    import json
                    request_data = json.loads(request_data)
                except json.JSONDecodeError as e:
                    raise ValueError(f"无法解析request_data为JSON: {str(e)}")
            if not isinstance(request_data, dict):
                raise ValueError(f"request_data必须是字典类型,实际类型: {type(request_data)}")
            # 1. Extract input, output and the request content.
            input_data = request_data.get('input', '')
            output_data = request_data.get('output', '')
            # NOTE(review): key here is 'request_data', but the docstring and
            # the error message below refer to 'request_content' -- confirm
            # which key the caller actually sends.
            request_content = request_data.get('request_data', '')
            # Strip HTML tags if the content came from a rich-text editor.
            if request_content and (request_content.startswith('<p>') or '<' in request_content):
                import re
                request_content = re.sub(r'<[^>]+>', '', request_content).strip()
            if not input_data or not output_data or not request_content:
                raise ValueError(f"缺少必要参数:input='{input_data}', output='{output_data}', request_content='{request_content[:100] if request_content else ''}' 不能为空")
            logger.info(f"解析得到 - input: {input_data}, output: {output_data}, request_content: {request_content}")
            # 2. Resolve each comma-separated input table to its DDL.
            source_tables_ddl = []
            input_tables = []
            if input_data:
                tables = [table.strip() for table in input_data.split(',') if table.strip()]
                for table in tables:
                    ddl = DataFlowService._parse_table_and_get_ddl(table, 'input')
                    if ddl:
                        input_tables.append(table)
                        source_tables_ddl.append(ddl)
                    else:
                        logger.warning(f"无法获取输入表 {table} 的DDL结构")
            # 3. Resolve the output table to its DDL.
            target_table_ddl = ""
            if output_data:
                target_table_ddl = DataFlowService._parse_table_and_get_ddl(output_data.strip(), 'output')
                if not target_table_ddl:
                    logger.warning(f"无法获取输出表 {output_data} 的DDL结构")
            # 4. Assemble the prompt following the Deepseek-prompt.txt frame.
            prompt_parts = []
            # Role definition preamble.
            prompt_parts.append("你是一名数据库工程师,正在构建一个PostgreSQL数据中的汇总逻辑。请为以下需求生成一段标准的 PostgreSQL SQL 脚本:")
            # Numbered source-table sections.
            for i, (table, ddl) in enumerate(zip(input_tables, source_tables_ddl), 1):
                table_name = table.split(':')[-1] if ':' in table else table
                prompt_parts.append(f"{i}.有一个源表: {table_name},它的定义语句如下:")
                prompt_parts.append(ddl)
                prompt_parts.append("")  # blank-line separator
            # Target-table section, numbered after the sources.
            if target_table_ddl:
                target_table_name = output_data.split(':')[-1] if ':' in output_data else output_data
                next_index = len(input_tables) + 1
                prompt_parts.append(f"{next_index}.有一个目标表:{target_table_name},它的定义语句如下:")
                prompt_parts.append(target_table_ddl)
                prompt_parts.append("")  # blank-line separator
            # Processing-logic section.
            next_index = len(input_tables) + 2 if target_table_ddl else len(input_tables) + 1
            prompt_parts.append(f"{next_index}.处理逻辑为:{request_content}")
            prompt_parts.append("")  # blank-line separator
            # Fixed technical requirements.
            tech_requirements = [
                f"{next_index + 1}.脚本应使用标准的 PostgreSQL 语法,适合在 Airflow、Python 脚本、或调度系统中调用;",
                f"{next_index + 2}.无需使用 UPSERT 或 ON CONFLICT",
                f"{next_index + 3}.请直接输出SQL,无需进行解释。",
                f"{next_index + 4}.请给这段sql起个英文名,不少于三个英文单词,使用\"_\"分隔,采用蛇形命名法。把sql的名字作为注释写在返回的sql中。",
                f"{next_index + 5}.生成的sql在向目标表插入数据的时候,向create_time字段写入当前日期时间now(),不用处理update_time字段"
            ]
            prompt_parts.extend(tech_requirements)
            full_prompt = "\n".join(prompt_parts)
            logger.info(f"构建的完整提示语长度: {len(full_prompt)}")
            logger.info(f"完整提示语内容: {full_prompt}")
            # 5. Call the LLM and normalize the result to a string.
            logger.info("开始调用Deepseek模型生成SQL脚本")
            script_content = llm_sql(full_prompt)
            if not script_content:
                raise ValueError("Deepseek模型返回空内容")
            if not isinstance(script_content, str):
                script_content = str(script_content)
            logger.info(f"SQL脚本生成成功,内容长度: {len(script_content)}")
            return script_content
        except Exception as e:
            logger.error(f"生成SQL脚本失败: {str(e)}")
            raise e
  736. @staticmethod
  737. def _parse_table_and_get_ddl(table_str: str, table_type: str) -> str:
  738. """
  739. 解析表格式(A:B)并从Neo4j查询元数据生成DDL
  740. Args:
  741. table_str: 表格式字符串,格式为"label:name_en"
  742. table_type: 表类型,用于日志记录(input/output)
  743. Returns:
  744. DDL格式的表结构字符串
  745. """
  746. try:
  747. # 解析A:B格式
  748. if ':' not in table_str:
  749. logger.error(f"表格式错误,应为'label:name_en'格式: {table_str}")
  750. return ""
  751. parts = table_str.split(':', 1)
  752. if len(parts) != 2:
  753. logger.error(f"表格式解析失败: {table_str}")
  754. return ""
  755. label = parts[0].strip()
  756. name_en = parts[1].strip()
  757. if not label or not name_en:
  758. logger.error(f"标签或英文名为空: label={label}, name_en={name_en}")
  759. return ""
  760. logger.info(f"开始查询{table_type}表: label={label}, name_en={name_en}")
  761. # 从Neo4j查询节点及其关联的元数据
  762. with connect_graph().session() as session:
  763. # 查询节点及其关联的元数据
  764. cypher = f"""
  765. MATCH (n:{label} {{name_en: $name_en}})
  766. OPTIONAL MATCH (n)-[:INCLUDES]->(m:DataMeta)
  767. RETURN n, collect(m) as metadata
  768. """
  769. result = session.run(cypher, name_en=name_en)
  770. record = result.single()
  771. if not record:
  772. logger.error(f"未找到节点: label={label}, name_en={name_en}")
  773. return ""
  774. node = record['n']
  775. metadata = record['metadata']
  776. logger.info(f"找到节点,关联元数据数量: {len(metadata)}")
  777. # 生成DDL格式的表结构
  778. ddl_lines = []
  779. ddl_lines.append(f"CREATE TABLE {name_en} (")
  780. if metadata:
  781. column_definitions = []
  782. for meta in metadata:
  783. if meta: # 确保meta不为空
  784. meta_props = dict(meta)
  785. column_name = meta_props.get('name_en', meta_props.get('name_zh', 'unknown_column'))
  786. data_type = meta_props.get('data_type', 'VARCHAR(255)')
  787. comment = meta_props.get('name_zh', '')
  788. # 构建列定义
  789. column_def = f" {column_name} {data_type}"
  790. if comment:
  791. column_def += f" COMMENT '{comment}'"
  792. column_definitions.append(column_def)
  793. if column_definitions:
  794. ddl_lines.append(",\n".join(column_definitions))
  795. else:
  796. ddl_lines.append(" id BIGINT PRIMARY KEY COMMENT '主键ID'")
  797. else:
  798. # 如果没有元数据,添加默认列
  799. ddl_lines.append(" id BIGINT PRIMARY KEY COMMENT '主键ID'")
  800. ddl_lines.append(");")
  801. # 添加表注释
  802. node_props = dict(node)
  803. table_comment = node_props.get('name_zh', node_props.get('describe', name_en))
  804. if table_comment and table_comment != name_en:
  805. ddl_lines.append(f"COMMENT ON TABLE {name_en} IS '{table_comment}';")
  806. ddl_content = "\n".join(ddl_lines)
  807. logger.info(f"{table_type}表DDL生成成功: {name_en}")
  808. logger.debug(f"生成的DDL: {ddl_content}")
  809. return ddl_content
  810. except Exception as e:
  811. logger.error(f"解析表格式和生成DDL失败: {str(e)}")
  812. return ""
  813. @staticmethod
  814. def _handle_script_relationships(data: Dict[str, Any],dataflow_name:str,name_en:str):
  815. """
  816. 处理脚本关系,在Neo4j图数据库中创建从source_table到target_table之间的DERIVED_FROM关系
  817. Args:
  818. data: 包含脚本信息的数据字典,应包含script_name, script_type, schedule_status, source_table, target_table, update_mode
  819. """
  820. try:
  821. # 从data中读取键值对
  822. script_name = dataflow_name,
  823. script_type = data.get('script_type', 'sql')
  824. schedule_status = data.get('status', 'inactive')
  825. source_table_full = data.get('source_table', '')
  826. target_table_full = data.get('target_table', '')
  827. update_mode = data.get('update_mode', 'full')
  828. # 处理source_table和target_table的格式
  829. source_table = source_table_full.split(':')[-1] if ':' in source_table_full else source_table_full
  830. target_table = target_table_full.split(':')[-1] if ':' in target_table_full else target_table_full
  831. source_label = source_table_full.split(':')[0] if ':' in source_table_full else source_table_full
  832. target_label = target_table_full.split(':')[0] if ':' in target_table_full else target_table_full
  833. # 验证必要字段
  834. if not source_table or not target_table:
  835. logger.warning(f"source_table或target_table为空,跳过关系创建: source_table={source_table}, target_table={target_table}")
  836. return
  837. logger.info(f"开始创建脚本关系: {source_table} -> {target_table}")
  838. with connect_graph().session() as session:
  839. # 创建或获取source和target节点
  840. create_nodes_query = f"""
  841. MERGE (source:{source_label} {{name: $source_table}})
  842. ON CREATE SET source.created_at = $created_at,
  843. source.type = 'source'
  844. WITH source
  845. MERGE (target:{target_label} {{name: $target_table}})
  846. ON CREATE SET target.created_at = $created_at,
  847. target.type = 'target'
  848. RETURN source, target, id(source) as source_id, id(target) as target_id
  849. """
  850. # 执行创建节点的查询
  851. result = session.run(create_nodes_query,
  852. source_table=source_table,
  853. target_table=target_table,
  854. created_at=get_formatted_time()).single()
  855. if result:
  856. source_id = result['source_id']
  857. target_id = result['target_id']
  858. # 检查并创建关系
  859. create_relationship_query = f"""
  860. MATCH (source:{source_label}), (target:{target_label})
  861. WHERE id(source) = $source_id AND id(target) = $target_id
  862. AND NOT EXISTS((target)-[:DERIVED_FROM]->(source))
  863. CREATE (target)-[r:DERIVED_FROM]->(source)
  864. SET r.script_name = $script_name,
  865. r.script_type = $script_type,
  866. r.schedule_status = $schedule_status,
  867. r.update_mode = $update_mode,
  868. r.created_at = $created_at,
  869. r.updated_at = $created_at
  870. RETURN r
  871. """
  872. relationship_result = session.run(create_relationship_query,
  873. source_id=source_id,
  874. target_id=target_id,
  875. script_name=script_name,
  876. script_type=script_type,
  877. schedule_status=schedule_status,
  878. update_mode=update_mode,
  879. created_at=get_formatted_time()).single()
  880. if relationship_result:
  881. logger.info(f"成功创建DERIVED_FROM关系: {target_table} -> {source_table} (script: {script_name})")
  882. else:
  883. logger.info(f"DERIVED_FROM关系已存在: {target_table} -> {source_table}")
  884. else:
  885. logger.error(f"创建表节点失败: source_table={source_table}, target_table={target_table}")
  886. except Exception as e:
  887. logger.error(f"处理脚本关系失败: {str(e)}")
  888. raise e