|
@@ -199,38 +199,7 @@ class DuplicateBusinessCard(db.Model):
|
|
|
}
|
|
|
|
|
|
|
|
|
-# 解析任务存储库数据模型
|
|
|
-class ParseTaskRepository(db.Model):
|
|
|
- __tablename__ = 'parse_task_repository'
|
|
|
-
|
|
|
- id = db.Column(db.Integer, primary_key=True, autoincrement=True)
|
|
|
- task_name = db.Column(db.String(100), nullable=False)
|
|
|
- task_status = db.Column(db.String(10), nullable=False)
|
|
|
- task_type = db.Column(db.String(50), nullable=False)
|
|
|
- task_source = db.Column(db.JSON, nullable=False)
|
|
|
- collection_count = db.Column(db.Integer, nullable=False, default=0)
|
|
|
- parse_count = db.Column(db.Integer, nullable=False, default=0)
|
|
|
- parse_result = db.Column(db.JSON)
|
|
|
- created_at = db.Column(db.DateTime, default=datetime.now, nullable=False)
|
|
|
- created_by = db.Column(db.String(50), nullable=False)
|
|
|
- updated_at = db.Column(db.DateTime, default=datetime.now, onupdate=datetime.now, nullable=False)
|
|
|
- updated_by = db.Column(db.String(50), nullable=False)
|
|
|
-
|
|
|
- def to_dict(self):
|
|
|
- return {
|
|
|
- 'id': self.id,
|
|
|
- 'task_name': self.task_name,
|
|
|
- 'task_status': self.task_status,
|
|
|
- 'task_type': self.task_type,
|
|
|
- 'task_source': self.task_source,
|
|
|
- 'collection_count': self.collection_count,
|
|
|
- 'parse_count': self.parse_count,
|
|
|
- 'parse_result': self.parse_result,
|
|
|
- 'created_at': self.created_at.strftime('%Y-%m-%d %H:%M:%S') if self.created_at else None,
|
|
|
- 'created_by': self.created_by,
|
|
|
- 'updated_at': self.updated_at.strftime('%Y-%m-%d %H:%M:%S') if self.updated_at else None,
|
|
|
- 'updated_by': self.updated_by
|
|
|
- }
|
|
|
+
|
|
|
|
|
|
|
|
|
# 配置变量
|
|
@@ -1690,15 +1659,15 @@ def query_neo4j_graph(query_requirement):
|
|
|
logging.info("第一步:从Neo4j获取人才类别的标签列表")
|
|
|
all_labels_query = """
|
|
|
MATCH (dl:DataLabel)
|
|
|
- WHERE dl.category CONTAINS '人才' OR dl.category CONTAINS 'talent'
|
|
|
- RETURN dl.name as name
|
|
|
+ WHERE dl.category CONTAINS '人才地图' OR dl.category CONTAINS 'talentmap'
|
|
|
+ RETURN dl.name_zh as name_zh, dl.name_en as name_en
|
|
|
"""
|
|
|
|
|
|
all_labels = []
|
|
|
with neo4j_driver.get_session() as session:
|
|
|
result = session.run(all_labels_query)
|
|
|
for record in result:
|
|
|
- all_labels.append(record['name'])
|
|
|
+ all_labels.append(record['name_zh'])
|
|
|
|
|
|
logging.info(f"获取到{len(all_labels)}个人才标签: {all_labels}")
|
|
|
|
|
@@ -1710,27 +1679,43 @@ def query_neo4j_graph(query_requirement):
|
|
|
|
|
|
# 构建匹配标签的提示语
|
|
|
matching_prompt = f"""
|
|
|
- 请分析以下查询需求,并从标签列表中找出与查询需求相关的标签。
|
|
|
-
|
|
|
- ## 查询需求
|
|
|
+ 请从上传的查询需求文本中提取以下结构化信息。其中datalabel字段从可用标签列表里进行匹配,匹配结果填写可用标签列表里的标签名称。需要严格按照JSON格式输出:
|
|
|
+ {{
|
|
|
+ "basic_info": {{
|
|
|
+ "中文姓名": "",
|
|
|
+ "英文姓名": "",
|
|
|
+ "手机号": "",
|
|
|
+ "固定电话": "",
|
|
|
+ "电子邮箱": "",
|
|
|
+ "生日": "",
|
|
|
+ "年龄": "",
|
|
|
+ "居住地": "",
|
|
|
+ "籍贯": ""
|
|
|
+ }},
|
|
|
+ "datalabel": [
|
|
|
+ "标签1","标签2","标签3"
|
|
|
+ ]
|
|
|
+ }}
|
|
|
+ ## 查询需求文本
|
|
|
{query_requirement}
|
|
|
|
|
|
## 可用标签列表
|
|
|
{labels_json}
|
|
|
|
|
|
- ## 输出要求
|
|
|
- 1. 请以JSON数组格式返回匹配的标签名称列表,格式如: ["标签1", "标签2", "标签3"]
|
|
|
- 2. 只返回标签名称数组,不要包含任何解释或其他文本
|
|
|
- 3. 如果没有找到匹配的标签,请返回空数组 []
|
|
|
+ 输出要求:
|
|
|
+ 1. 中文名称优先,有英文名称也要提取保留
|
|
|
+ 2. 年龄字段只需填写数字。
|
|
|
+ 3. 标签没有被匹配到,datalabel字段可以为空数组
|
|
|
+ 4. 只需返回JSON字符串,不要返回其他信息
|
|
|
"""
|
|
|
|
|
|
# 调用阿里千问API匹配标签
|
|
|
logging.info("发送请求到阿里千问API匹配标签:"+matching_prompt)
|
|
|
|
|
|
completion = client.chat.completions.create(
|
|
|
- model=model_name,
|
|
|
+ model="qwen-long-latest", # 使用qwen-long-latest模型
|
|
|
messages=[
|
|
|
- {"role": "system", "content": "你是一个专业的文本分析和匹配专家。"},
|
|
|
+ {"role": "system", "content": "你是一个专业的文本信息提取专家。"},
|
|
|
{"role": "user", "content": matching_prompt}
|
|
|
],
|
|
|
temperature=0.1,
|
|
@@ -1740,129 +1725,154 @@ def query_neo4j_graph(query_requirement):
|
|
|
# 解析API响应
|
|
|
matching_content = completion.choices[0].message.content
|
|
|
|
|
|
- # 提取JSON数组
|
|
|
- try:
|
|
|
- # 尝试直接解析返回结果,预期格式为 ["新开酒店经验", "五星级酒店", "总经理"]
|
|
|
- logging.info(f"阿里千问返回的匹配内容: {matching_content}")
|
|
|
-
|
|
|
- # 如果返回的是JSON字符串,先去除可能的前后缀文本
|
|
|
- if isinstance(matching_content, str):
|
|
|
- # 查找JSON数组的开始和结束位置
|
|
|
- start_idx = matching_content.find('[')
|
|
|
- end_idx = matching_content.rfind(']') + 1
|
|
|
-
|
|
|
- if start_idx >= 0 and end_idx > start_idx:
|
|
|
- json_str = matching_content[start_idx:end_idx]
|
|
|
- matched_labels = json.loads(json_str)
|
|
|
- else:
|
|
|
- matched_labels = []
|
|
|
- else:
|
|
|
- matched_labels = []
|
|
|
-
|
|
|
- # 确保结果是字符串列表
|
|
|
- if matched_labels and all(isinstance(item, str) for item in matched_labels):
|
|
|
- logging.info(f"成功解析到标签列表: {matched_labels}")
|
|
|
- else:
|
|
|
- logging.warning("解析结果不是预期的字符串列表格式,将使用空列表")
|
|
|
- matched_labels = []
|
|
|
- except json.JSONDecodeError as e:
|
|
|
- logging.error(f"JSON解析错误: {str(e)}")
|
|
|
- matched_labels = []
|
|
|
- except Exception as e:
|
|
|
- logging.error(f"解析匹配标签时出错: {str(e)}")
|
|
|
- matched_labels = []
|
|
|
+ # 直接解析JSON响应,提取datalabel字段
|
|
|
+ parsed_content = json.loads(matching_content)
|
|
|
+ matched_labels = parsed_content.get('datalabel', [])
|
|
|
|
|
|
logging.info(f"匹配到的标签: {matched_labels}")
|
|
|
|
|
|
- # 如果没有匹配到标签,返回空结果
|
|
|
- if not matched_labels:
|
|
|
- return {
|
|
|
- 'code': 200,
|
|
|
- 'success': True,
|
|
|
- 'message': '未找到与查询需求匹配的标签',
|
|
|
- 'query': '',
|
|
|
- 'data': []
|
|
|
- }
|
|
|
-
|
|
|
- # 步骤3: 构建Cypher生成提示文本
|
|
|
- logging.info("第三步:构建提示文本生成Cypher查询语句")
|
|
|
-
|
|
|
- # 将匹配的标签转换为字符串
|
|
|
- matched_labels_str = ", ".join([f"'{label}'" for label in matched_labels])
|
|
|
-
|
|
|
- # 构建生成Cypher的提示语
|
|
|
- cypher_prompt = f"""
|
|
|
- 请根据以下Neo4j图数据库结构和已匹配的标签,生成一个Cypher查询脚本。
|
|
|
-
|
|
|
- ## 图数据库结构
|
|
|
-
|
|
|
- ### 节点
|
|
|
- 1. Talent - 人才节点
|
|
|
- 属性: pg_id(PostgreSQL数据库ID), name_zh(中文姓名), name_en(英文姓名),
|
|
|
- mobile(手机号码), email(电子邮箱), updated_at(更新时间)
|
|
|
-
|
|
|
- 2. DataLabel - 人才标签节点
|
|
|
-
|
|
|
- ### 关系
|
|
|
- BELONGS_TO - 从属关系
|
|
|
- (Talent)-[BELONGS_TO]->(DataLabel) - 人才属于某标签
|
|
|
-
|
|
|
- ## 匹配的标签列表
|
|
|
- [{matched_labels_str}]
|
|
|
-
|
|
|
- ## 查询需求
|
|
|
- {query_requirement}
|
|
|
-
|
|
|
- ## 输出要求
|
|
|
- 1. 只输出有效的Cypher查询语句,不要包含任何解释或注释
|
|
|
- 2. 确保return语句中包含talent节点属性
|
|
|
- 3. 尽量利用图数据库的特性来优化查询效率
|
|
|
- 4. 使用WITH子句和COLLECT函数收集标签,确保查询到至少拥有一个标签的人才
|
|
|
-
|
|
|
- 注意:请直接返回Cypher查询语句,无需任何其他文本。
|
|
|
-
|
|
|
- 以下是一个示例:
|
|
|
- 假设匹配的标签是 ['五星级酒店', '新开酒店经验', '总经理']
|
|
|
-
|
|
|
- 生成的Cypher查询语句应该是:
|
|
|
- MATCH (t:Talent)-[:BELONGS_TO]->(dl:DataLabel)
|
|
|
- WHERE dl.name IN ['五星级酒店', '新开酒店经验', '总经理']
|
|
|
- WITH t, COLLECT(DISTINCT dl.name) AS labels
|
|
|
- WHERE size(labels) >= 1
|
|
|
- RETURN t.pg_id as pg_id, t.name_zh as name_zh, t.name_en as name_en, t.mobile as mobile, t.email as email, t.updated_at as updated_at
|
|
|
- """
|
|
|
-
|
|
|
- # 调用阿里千问API生成Cypher脚本
|
|
|
- logging.info("发送请求到阿里千问API生成Cypher脚本")
|
|
|
-
|
|
|
- completion = client.chat.completions.create(
|
|
|
- model=model_name,
|
|
|
- messages=[
|
|
|
- {"role": "system", "content": "你是一个专业的Neo4j Cypher查询专家。"},
|
|
|
- {"role": "user", "content": cypher_prompt}
|
|
|
- ],
|
|
|
- temperature=0.1
|
|
|
- )
|
|
|
-
|
|
|
- # 解析API响应
|
|
|
- cypher_script = completion.choices[0].message.content
|
|
|
-
|
|
|
- # 清理Cypher脚本,移除不必要的markdown格式或注释
|
|
|
- cypher_script = cypher_script.strip()
|
|
|
- if cypher_script.startswith("```cypher"):
|
|
|
- cypher_script = cypher_script[9:]
|
|
|
- elif cypher_script.startswith("```"):
|
|
|
- cypher_script = cypher_script[3:]
|
|
|
- if cypher_script.endswith("```"):
|
|
|
- cypher_script = cypher_script[:-3]
|
|
|
- cypher_script = cypher_script.strip()
|
|
|
+ # 步骤3: 构建查询逻辑和Cypher语句
|
|
|
+ logging.info("第三步:构建查询逻辑和Cypher语句")
|
|
|
+
|
|
|
+ # 提取basic_info中的非空字段
|
|
|
+ basic_info = parsed_content.get('basic_info', {})
|
|
|
+ non_empty_fields = {k: v for k, v in basic_info.items() if v and str(v).strip()}
|
|
|
+
|
|
|
+ logging.info(f"提取到的非空字段: {non_empty_fields}")
|
|
|
+
|
|
|
+ # 构建Talent节点子集查询
|
|
|
+ talent_conditions = []
|
|
|
+ talent_params = {}
|
|
|
+
|
|
|
+ if non_empty_fields:
|
|
|
+ # 如果有非空字段,构建Talent节点属性匹配条件
|
|
|
+ for field, value in non_empty_fields.items():
|
|
|
+ if field == "中文姓名":
|
|
|
+ talent_conditions.append("t.name_zh CONTAINS $name_zh")
|
|
|
+ talent_params['name_zh'] = value
|
|
|
+ elif field == "英文姓名":
|
|
|
+ talent_conditions.append("t.name_en CONTAINS $name_en")
|
|
|
+ talent_params['name_en'] = value
|
|
|
+ elif field == "手机号":
|
|
|
+ talent_conditions.append("t.mobile CONTAINS $mobile")
|
|
|
+ talent_params['mobile'] = value
|
|
|
+ elif field == "固定电话":
|
|
|
+ talent_conditions.append("t.phone CONTAINS $phone")
|
|
|
+ talent_params['phone'] = value
|
|
|
+ elif field == "电子邮箱":
|
|
|
+ talent_conditions.append("t.email CONTAINS $email")
|
|
|
+ talent_params['email'] = value
|
|
|
+ elif field == "生日":
|
|
|
+ # 格式化生日为YYYY-MM-DD格式
|
|
|
+ try:
|
|
|
+ from datetime import datetime
|
|
|
+ # 尝试解析各种可能的日期格式
|
|
|
+ if isinstance(value, str):
|
|
|
+ # 处理常见的日期格式
|
|
|
+ if len(value) == 8 and value.isdigit(): # YYYYMMDD
|
|
|
+ formatted_birthday = f"{value[:4]}-{value[4:6]}-{value[6:8]}"
|
|
|
+ elif len(value) == 10 and value.count('-') == 2: # YYYY-MM-DD
|
|
|
+ formatted_birthday = value
|
|
|
+ elif len(value) == 10 and value.count('/') == 2: # YYYY/MM/DD
|
|
|
+ date_obj = datetime.strptime(value, '%Y/%m/%d')
|
|
|
+ formatted_birthday = date_obj.strftime('%Y-%m-%d')
|
|
|
+ else:
|
|
|
+ # 尝试其他常见格式
|
|
|
+ for fmt in ['%Y-%m-%d', '%Y/%m/%d', '%Y.%m.%d', '%Y年%m月%d日']:
|
|
|
+ try:
|
|
|
+ date_obj = datetime.strptime(value, fmt)
|
|
|
+ formatted_birthday = date_obj.strftime('%Y-%m-%d')
|
|
|
+ break
|
|
|
+ except ValueError:
|
|
|
+ continue
|
|
|
+ else:
|
|
|
+ # 如果所有格式都失败,使用原始值
|
|
|
+ formatted_birthday = value
|
|
|
+ else:
|
|
|
+ formatted_birthday = str(value)
|
|
|
+
|
|
|
+ talent_conditions.append("t.birthday = $birthday")
|
|
|
+ talent_params['birthday'] = formatted_birthday
|
|
|
+ logging.info(f"生日字段格式化: {value} -> {formatted_birthday}")
|
|
|
+ except Exception as e:
|
|
|
+ logging.warning(f"生日字段格式化失败: {value}, 错误: {str(e)}")
|
|
|
+ # 如果格式化失败,使用原始值
|
|
|
+ talent_conditions.append("t.birthday = $birthday")
|
|
|
+ talent_params['birthday'] = value
|
|
|
+ elif field == "年龄":
|
|
|
+ talent_conditions.append("t.age = $age")
|
|
|
+ talent_params['age'] = int(value) if value.isdigit() else 0
|
|
|
+ elif field == "居住地":
|
|
|
+ talent_conditions.append("t.residence CONTAINS $residence")
|
|
|
+ talent_params['residence'] = value
|
|
|
+ elif field == "籍贯":
|
|
|
+ talent_conditions.append("t.origin CONTAINS $origin")
|
|
|
+ talent_params['origin'] = value
|
|
|
+
|
|
|
+ # 构建Talent子集查询
|
|
|
+ if talent_conditions:
|
|
|
+ talent_subset_query = f"""
|
|
|
+ MATCH (t:Talent)
|
|
|
+ WHERE {' AND '.join(talent_conditions)}
|
|
|
+ WITH t
|
|
|
+ """
|
|
|
+ logging.info("构建Talent子集查询条件")
|
|
|
+ else:
|
|
|
+ talent_subset_query = """
|
|
|
+ MATCH (t:Talent)
|
|
|
+ WITH t
|
|
|
+ """
|
|
|
+ logging.info("使用所有Talent节点")
|
|
|
+
|
|
|
+ # 构建条件子集查询(DataLabel节点和Hotel节点)
|
|
|
+ condition_params = {}
|
|
|
+
|
|
|
+ if matched_labels:
|
|
|
+ condition_params['labels'] = matched_labels
|
|
|
+ logging.info(f"构建DataLabel和Hotel条件查询,标签: {matched_labels}")
|
|
|
+
|
|
|
+ # 步骤4: 执行查询并返回结果
|
|
|
+ logging.info("第四步:执行查询并返回结果")
|
|
|
+
|
|
|
+ # 构建完整的Cypher查询语句
|
|
|
+ if matched_labels:
|
|
|
+ # 有标签条件的情况 - 查找与条件子集(DataLabel和Hotel)有关系的Talent节点
|
|
|
+ # 使用OR逻辑:Talent有WORK_FOR关系链路或者有BELONGS_TO关系链路的节点都可以查询出来
|
|
|
+ cypher_script = f"""
|
|
|
+ {talent_subset_query}
|
|
|
+ WHERE EXISTS {{
|
|
|
+ // 条件1:存在WORK_FOR关系链路
|
|
|
+ MATCH (t)-[:WORK_FOR]->(:Hotel)-[:HAS_LABEL]->(dl:DataLabel)
|
|
|
+ WHERE dl.name_zh IN $labels
|
|
|
+ }} OR EXISTS {{
|
|
|
+ // 条件2:存在BELONGS_TO关系链路
|
|
|
+ MATCH (t)-[:BELONGS_TO]->(dl2:DataLabel)
|
|
|
+ WHERE dl2.name_zh IN $labels
|
|
|
+ }}
|
|
|
+ RETURN DISTINCT
|
|
|
+ t.pg_id AS pg_id,
|
|
|
+ t.name_zh AS name_zh,
|
|
|
+ t.name_en AS name_en,
|
|
|
+ t.mobile AS mobile,
|
|
|
+ t.email AS email,
|
|
|
+ t.updated_at AS updated_at
|
|
|
+ """
|
|
|
+ else:
|
|
|
+ # 无标签条件的情况,只根据Talent属性查询
|
|
|
+ cypher_script = f"""
|
|
|
+ {talent_subset_query}
|
|
|
+ RETURN DISTINCT t.pg_id as pg_id, t.name_zh as name_zh, t.name_en as name_en,
|
|
|
+ t.mobile as mobile, t.email as email, t.updated_at as updated_at
|
|
|
+ """
|
|
|
|
|
|
logging.info(f"生成的Cypher脚本: {cypher_script}")
|
|
|
|
|
|
- # 步骤4: 执行Cypher脚本
|
|
|
- logging.info("第四步:执行Cypher脚本并返回结果")
|
|
|
+ # 合并所有参数
|
|
|
+ all_params = {**talent_params, **condition_params}
|
|
|
+
|
|
|
+ # 执行查询
|
|
|
with neo4j_driver.get_session() as session:
|
|
|
- result = session.run(cypher_script)
|
|
|
+ result = session.run(cypher_script, **all_params)
|
|
|
records = [record.data() for record in result]
|
|
|
|
|
|
# 构建查询结果
|
|
@@ -1872,6 +1882,7 @@ def query_neo4j_graph(query_requirement):
|
|
|
'message': '查询成功执行',
|
|
|
'query': cypher_script,
|
|
|
'matched_labels': matched_labels,
|
|
|
+ 'non_empty_fields': non_empty_fields,
|
|
|
'data': records
|
|
|
}
|
|
|
|