parse.py 66 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846
  1. from typing import Dict, Any
  2. from app import db
  3. from datetime import datetime
  4. import os
  5. import boto3
  6. from botocore.config import Config
  7. import logging
  8. import requests
  9. import json
  10. import re
  11. import uuid
  12. from PIL import Image
  13. from io import BytesIO
  14. import pytesseract
  15. import base64
  16. from openai import OpenAI
  17. from app.config.config import DevelopmentConfig, ProductionConfig
  18. # 测试用的解析数据接口。没有实际使用。
  19. def parse_data(data: Dict[str, Any]) -> Dict[str, Any]:
  20. """
  21. 解析数据的主函数
  22. Args:
  23. data: 要解析的数据
  24. Returns:
  25. 解析后的数据
  26. """
  27. # TODO: 实现数据解析逻辑
  28. return {
  29. 'code': 200,
  30. 'status': 'success',
  31. 'message': 'Data parsed successfully',
  32. 'data': data
  33. }
  34. # 名片解析数据模型
  35. class BusinessCard(db.Model):
  36. __tablename__ = 'business_cards'
  37. id = db.Column(db.Integer, primary_key=True, autoincrement=True)
  38. name_zh = db.Column(db.String(100), nullable=False)
  39. name_en = db.Column(db.String(100))
  40. title_zh = db.Column(db.String(100))
  41. title_en = db.Column(db.String(100))
  42. mobile = db.Column(db.String(50))
  43. phone = db.Column(db.String(50))
  44. email = db.Column(db.String(100))
  45. hotel_zh = db.Column(db.String(200))
  46. hotel_en = db.Column(db.String(200))
  47. address_zh = db.Column(db.Text)
  48. address_en = db.Column(db.Text)
  49. postal_code_zh = db.Column(db.String(20))
  50. postal_code_en = db.Column(db.String(20))
  51. brand_zh = db.Column(db.String(100))
  52. brand_en = db.Column(db.String(100))
  53. affiliation_zh = db.Column(db.String(200))
  54. affiliation_en = db.Column(db.String(200))
  55. image_path = db.Column(db.String(255)) # MinIO中存储的路径
  56. career_path = db.Column(db.JSON) # 职业轨迹,JSON格式
  57. brand_group = db.Column(db.String(200)) # 品牌组合
  58. created_at = db.Column(db.DateTime, default=datetime.now, nullable=False)
  59. updated_at = db.Column(db.DateTime, onupdate=datetime.now)
  60. updated_by = db.Column(db.String(50))
  61. status = db.Column(db.String(20), default='active')
  62. def to_dict(self):
  63. return {
  64. 'id': self.id,
  65. 'name_zh': self.name_zh,
  66. 'name_en': self.name_en,
  67. 'title_zh': self.title_zh,
  68. 'title_en': self.title_en,
  69. 'mobile': self.mobile,
  70. 'phone': self.phone,
  71. 'email': self.email,
  72. 'hotel_zh': self.hotel_zh,
  73. 'hotel_en': self.hotel_en,
  74. 'address_zh': self.address_zh,
  75. 'address_en': self.address_en,
  76. 'postal_code_zh': self.postal_code_zh,
  77. 'postal_code_en': self.postal_code_en,
  78. 'brand_zh': self.brand_zh,
  79. 'brand_en': self.brand_en,
  80. 'affiliation_zh': self.affiliation_zh,
  81. 'affiliation_en': self.affiliation_en,
  82. 'image_path': self.image_path,
  83. 'career_path': self.career_path,
  84. 'brand_group': self.brand_group,
  85. 'created_at': self.created_at.strftime('%Y-%m-%d %H:%M:%S') if self.created_at else None,
  86. 'updated_at': self.updated_at.strftime('%Y-%m-%d %H:%M:%S') if self.updated_at else None,
  87. 'updated_by': self.updated_by,
  88. 'status': self.status
  89. }
# Business-card parsing module
# DeepSeek API configuration
# NOTE(review): the fallback value below looks like a real API key committed to
# source — confirm, rotate it, and supply the key through the environment only.
DEEPSEEK_API_KEY = os.environ.get('DEEPSEEK_API_KEY', 'sk-2aea6e8b159b448aa3c1e29acd6f4349')
DEEPSEEK_API_URL = os.environ.get('DEEPSEEK_API_URL', 'https://api.deepseek.com/v1/chat/completions')
# Backup API endpoint
DEEPSEEK_API_URL_BACKUP = 'https://api.deepseek.com/v1/completions'
# OCR configuration
# Set the pytesseract binary path (if needed)
# pytesseract.pytesseract.tesseract_cmd = r'/path/to/tesseract'
# OCR language setting; multiple languages are supported
OCR_LANG = os.environ.get('OCR_LANG', 'chi_sim+eng')
# Choose the configuration by environment
# (the selection logic is disabled: it is kept below as a bare string literal)
"""
if os.environ.get('FLASK_ENV') == 'production':
config = ProductionConfig()
else:
config = DevelopmentConfig()
"""
# Use the configuration object; production is assumed by default
config = ProductionConfig()
# Derive the MinIO connection settings from the configuration object
minio_url = f"{'https' if config.MINIO_SECURE else 'http'}://{config.MINIO_HOST}"
minio_access_key = config.MINIO_USER
minio_secret_key = config.MINIO_PASSWORD
minio_bucket = config.MINIO_BUCKET
use_ssl = config.MINIO_SECURE
  116. def get_minio_client():
  117. """获取MinIO客户端连接"""
  118. try:
  119. # 使用全局配置变量
  120. global minio_url, minio_access_key, minio_secret_key, minio_bucket, use_ssl
  121. logging.info(f"尝试连接MinIO服务器: {minio_url}")
  122. minio_client = boto3.client(
  123. 's3',
  124. endpoint_url=minio_url,
  125. aws_access_key_id=minio_access_key,
  126. aws_secret_access_key=minio_secret_key,
  127. config=Config(
  128. signature_version='s3v4',
  129. retries={'max_attempts': 3, 'mode': 'standard'},
  130. connect_timeout=10,
  131. read_timeout=30
  132. )
  133. )
  134. # 确保存储桶存在
  135. buckets = minio_client.list_buckets()
  136. bucket_names = [bucket['Name'] for bucket in buckets.get('Buckets', [])]
  137. logging.info(f"成功连接到MinIO服务器,现有存储桶: {bucket_names}")
  138. if minio_bucket not in bucket_names:
  139. logging.info(f"创建存储桶: {minio_bucket}")
  140. minio_client.create_bucket(Bucket=minio_bucket)
  141. return minio_client
  142. except Exception as e:
  143. logging.error(f"MinIO连接错误: {str(e)}")
  144. return None
  145. def extract_text_from_image(image_data):
  146. """
  147. 使用OCR从图像中提取文本,然后通过DeepSeek API解析名片信息
  148. Args:
  149. image_data (bytes): 图像的二进制数据
  150. Returns:
  151. dict: 提取的信息(姓名、职位、公司等)
  152. Raises:
  153. Exception: 当OCR或API调用失败或配置错误时抛出异常
  154. """
  155. try:
  156. # 步骤1: 使用OCR从图像中提取文本
  157. ocr_text = ocr_extract_text(image_data)
  158. if not ocr_text or ocr_text.strip() == "":
  159. error_msg = "OCR无法从图像中提取文本"
  160. logging.error(error_msg)
  161. raise Exception(error_msg)
  162. logging.info(f"OCR提取的文本: {ocr_text[:200]}..." if len(ocr_text) > 200 else ocr_text)
  163. # 步骤2: 使用DeepSeek API解析文本中的信息
  164. return parse_text_with_deepseek(ocr_text)
  165. except Exception as e:
  166. error_msg = f"从图像中提取和解析文本失败: {str(e)}"
  167. logging.error(error_msg, exc_info=True)
  168. raise Exception(error_msg)
  169. def ocr_extract_text(image_data):
  170. """
  171. 使用OCR从图像中提取文本
  172. Args:
  173. image_data (bytes): 图像的二进制数据
  174. Returns:
  175. str: 提取的文本
  176. """
  177. try:
  178. # 将二进制数据转换为PIL图像
  179. image = Image.open(BytesIO(image_data))
  180. # 使用pytesseract进行OCR文本提取
  181. text = pytesseract.image_to_string(image, lang=OCR_LANG)
  182. # 清理提取的文本
  183. text = text.strip()
  184. logging.info(f"OCR成功从图像中提取文本,长度: {len(text)}")
  185. print(text)
  186. return text
  187. except Exception as e:
  188. error_msg = f"OCR提取文本失败: {str(e)}"
  189. logging.error(error_msg, exc_info=True)
  190. raise Exception(error_msg)
  191. def parse_text_with_deepseek(text):
  192. """
  193. 使用DeepSeek API解析文本中的名片信息
  194. Args:
  195. text (str): 要解析的文本
  196. Returns:
  197. dict: 解析的名片信息
  198. """
  199. # 准备请求DeepSeek API
  200. if not DEEPSEEK_API_KEY:
  201. error_msg = "未配置DeepSeek API密钥"
  202. logging.error(error_msg)
  203. raise Exception(error_msg)
  204. # 构建API请求的基本信息
  205. headers = {
  206. "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
  207. "Content-Type": "application/json"
  208. }
  209. # 构建提示语,包含OCR提取的文本
  210. prompt = f"""请从以下名片文本中提取详细信息,需分别识别中英文内容。
  211. 以JSON格式返回,包含以下字段:
  212. - name_zh: 中文姓名
  213. - name_en: 英文姓名
  214. - title_zh: 中文职位/头衔
  215. - title_en: 英文职位/头衔
  216. - hotel_zh: 中文酒店/公司名称
  217. - hotel_en: 英文酒店/公司名称
  218. - mobile: 手机号码
  219. - phone: 固定电话
  220. - email: 电子邮箱
  221. - address_zh: 中文地址
  222. - address_en: 英文地址
  223. - brand_group: 品牌组合(如有多个品牌,以逗号分隔)
  224. - career_path: 职业轨迹(如果能从文本中推断出,以JSON数组格式返回,包含公司名称和职位)
  225. 名片文本:
  226. {text}
  227. """
  228. # 使用模型名称
  229. model_name = 'deepseek-chat'
  230. try:
  231. # 尝试调用DeepSeek API
  232. logging.info(f"尝试通过DeepSeek API解析文本")
  233. payload = {
  234. "model": model_name,
  235. "messages": [
  236. {"role": "system", "content": "你是一个专业的名片信息提取助手。请用JSON格式返回结果,不要有多余的文字说明。"},
  237. {"role": "user", "content": prompt}
  238. ],
  239. "temperature": 0.1
  240. }
  241. logging.info(f"向DeepSeek API发送请求")
  242. response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload, timeout=30)
  243. # 检查响应状态
  244. response.raise_for_status()
  245. # 解析API响应
  246. result = response.json()
  247. content = result.get("choices", [{}])[0].get("message", {}).get("content", "{}")
  248. # 尝试解析JSON内容
  249. try:
  250. # 找到内容中的JSON部分(有时模型会在JSON前后添加额外文本)
  251. json_content = extract_json_from_text(content)
  252. extracted_data = json.loads(json_content)
  253. logging.info(f"成功解析DeepSeek API返回的JSON")
  254. except json.JSONDecodeError:
  255. logging.warning(f"无法解析JSON,尝试直接从文本提取信息")
  256. # 如果无法解析JSON,尝试直接从文本中提取关键信息
  257. extracted_data = extract_fields_from_text(content)
  258. # 确保所有必要的字段都存在
  259. required_fields = ['name', 'title', 'company', 'phone', 'email', 'address', 'brand_group', 'career_path']
  260. for field in required_fields:
  261. if field not in extracted_data:
  262. extracted_data[field] = "" if field != 'career_path' else []
  263. logging.info(f"成功从DeepSeek API获取解析结果")
  264. return extracted_data
  265. except requests.exceptions.HTTPError as e:
  266. error_msg = f"DeepSeek API调用失败: {str(e)}"
  267. logging.error(error_msg)
  268. if hasattr(e, 'response') and e.response:
  269. logging.error(f"错误状态码: {e.response.status_code}")
  270. logging.error(f"错误内容: {e.response.text}")
  271. raise Exception(error_msg)
  272. except Exception as e:
  273. error_msg = f"解析文本过程中发生错误: {str(e)}"
  274. logging.error(error_msg, exc_info=True)
  275. raise Exception(error_msg)
  276. def extract_json_from_text(text):
  277. """
  278. 从文本中提取JSON部分
  279. Args:
  280. text (str): 包含JSON的文本
  281. Returns:
  282. str: 提取的JSON字符串
  283. """
  284. # 尝试找到最外层的花括号对
  285. start_idx = text.find('{')
  286. if start_idx == -1:
  287. return "{}"
  288. # 使用简单的括号匹配算法找到对应的闭合括号
  289. count = 0
  290. for i in range(start_idx, len(text)):
  291. if text[i] == '{':
  292. count += 1
  293. elif text[i] == '}':
  294. count -= 1
  295. if count == 0:
  296. return text[start_idx:i+1]
  297. # 如果没有找到闭合括号,返回从开始位置到文本结尾
  298. return text[start_idx:]
  299. def extract_fields_from_text(text):
  300. """
  301. 从文本中直接提取名片字段信息
  302. Args:
  303. text (str): 要分析的文本
  304. Returns:
  305. dict: 提取的字段
  306. """
  307. # 初始化结果字典
  308. result = {
  309. 'name_zh': '',
  310. 'name_en': '',
  311. 'title_zh': '',
  312. 'title_en': '',
  313. 'mobile': '',
  314. 'phone': '',
  315. 'email': '',
  316. 'hotel_zh': '',
  317. 'hotel_en': '',
  318. 'address_zh': '',
  319. 'address_en': '',
  320. 'postal_code_zh': '',
  321. 'postal_code_en': '',
  322. 'brand_zh': '',
  323. 'brand_en': '',
  324. 'affiliation_zh': '',
  325. 'affiliation_en': ''
  326. }
  327. # 提取中文姓名
  328. name_zh_match = re.search(r'["\'](姓名)["\'][\s\{:]*["\']?(中文)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  329. if name_zh_match:
  330. result['name_zh'] = name_zh_match.group(3)
  331. # 提取英文姓名
  332. name_en_match = re.search(r'["\'](姓名)["\'][\s\{:]*["\']?(英文)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  333. if name_en_match:
  334. result['name_en'] = name_en_match.group(3)
  335. # 提取中文头衔
  336. title_zh_match = re.search(r'["\'](头衔|职位)["\'][\s\{:]*["\']?(中文)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  337. if title_zh_match:
  338. result['title_zh'] = title_zh_match.group(3)
  339. # 提取英文头衔
  340. title_en_match = re.search(r'["\'](头衔|职位)["\'][\s\{:]*["\']?(英文)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  341. if title_en_match:
  342. result['title_en'] = title_en_match.group(3)
  343. # 提取手机
  344. mobile_match = re.search(r'["\'](手机)["\'][\s:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  345. if mobile_match:
  346. result['mobile'] = mobile_match.group(2)
  347. # 提取电话
  348. phone_match = re.search(r'["\'](电话)["\'][\s:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  349. if phone_match:
  350. result['phone'] = phone_match.group(2)
  351. # 提取邮箱
  352. email_match = re.search(r'["\'](邮箱)["\'][\s:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  353. if email_match:
  354. result['email'] = email_match.group(2)
  355. # 提取中文酒店名称
  356. hotel_zh_match = re.search(r'["\'](地址)["\'][\s\{:]*["\']?(中文)["\']?[\s\{:]*["\']?(酒店名称)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  357. if hotel_zh_match:
  358. result['hotel_zh'] = hotel_zh_match.group(4)
  359. # 提取英文酒店名称
  360. hotel_en_match = re.search(r'["\'](地址)["\'][\s\{:]*["\']?(英文)["\']?[\s\{:]*["\']?(酒店名称)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  361. if hotel_en_match:
  362. result['hotel_en'] = hotel_en_match.group(4)
  363. # 提取中文详细地址
  364. address_zh_match = re.search(r'["\'](地址)["\'][\s\{:]*["\']?(中文)["\']?[\s\{:]*["\']?(详细地址)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  365. if address_zh_match:
  366. result['address_zh'] = address_zh_match.group(4)
  367. # 提取英文详细地址
  368. address_en_match = re.search(r'["\'](地址)["\'][\s\{:]*["\']?(英文)["\']?[\s\{:]*["\']?(详细地址)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  369. if address_en_match:
  370. result['address_en'] = address_en_match.group(4)
  371. # 提取中文邮政编码
  372. postal_code_zh_match = re.search(r'["\'](地址)["\'][\s\{:]*["\']?(中文)["\']?[\s\{:]*["\']?(邮政编码)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  373. if postal_code_zh_match:
  374. result['postal_code_zh'] = postal_code_zh_match.group(4)
  375. # 提取英文邮政编码
  376. postal_code_en_match = re.search(r'["\'](地址)["\'][\s\{:]*["\']?(英文)["\']?[\s\{:]*["\']?(邮政编码)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  377. if postal_code_en_match:
  378. result['postal_code_en'] = postal_code_en_match.group(4)
  379. # 提取中文品牌名称
  380. brand_zh_match = re.search(r'["\'](公司)["\'][\s\{:]*["\']?(中文)["\']?[\s\{:]*["\']?(品牌名称)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  381. if brand_zh_match:
  382. result['brand_zh'] = brand_zh_match.group(4)
  383. # 提取英文品牌名称
  384. brand_en_match = re.search(r'["\'](公司)["\'][\s\{:]*["\']?(英文)["\']?[\s\{:]*["\']?(品牌名称)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  385. if brand_en_match:
  386. result['brand_en'] = brand_en_match.group(4)
  387. # 提取中文隶属关系
  388. affiliation_zh_match = re.search(r'["\'](公司)["\'][\s\{:]*["\']?(中文)["\']?[\s\{:]*["\']?(隶属关系)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  389. if affiliation_zh_match:
  390. result['affiliation_zh'] = affiliation_zh_match.group(4)
  391. # 提取英文隶属关系
  392. affiliation_en_match = re.search(r'["\'](公司)["\'][\s\{:]*["\']?(英文)["\']?[\s\{:]*["\']?(隶属关系)["\']?[\s\}:]*["\']([^"\']+)["\']', text, re.IGNORECASE)
  393. if affiliation_en_match:
  394. result['affiliation_en'] = affiliation_en_match.group(4)
  395. return result
  396. def parse_text_with_qwen25VLplus(image_data):
  397. """
  398. 使用阿里云的 Qwen 2.5 VL Plus 模型解析图像中的名片信息
  399. Args:
  400. image_data (bytes): 图像的二进制数据
  401. Returns:
  402. dict: 解析的名片信息
  403. """
  404. # 阿里云 Qwen API 配置
  405. QWEN_API_KEY = os.environ.get('QWEN_API_KEY', 'sk-8f2320dafc9e4076968accdd8eebd8e9')
  406. try:
  407. # 将图片数据转为 base64 编码
  408. base64_image = base64.b64encode(image_data).decode('utf-8')
  409. # 初始化 OpenAI 客户端,配置为阿里云 API
  410. client = OpenAI(
  411. api_key=QWEN_API_KEY,
  412. base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
  413. )
  414. # 构建优化后的提示语
  415. prompt = """你是企业名片的信息提取专家。请仔细分析提供的名片,精确提取以下信息:
  416. ## 提取要求
  417. - 区分中英文内容,分别提取
  418. - 保持提取信息的原始格式(如大小写、标点)
  419. - 对于无法识别或名片中不存在的信息,返回空字符串
  420. - 名片中没有的信息,请不要猜测
  421. ## 需提取的字段
  422. 1. 中文姓名 (name_zh)
  423. 2. 英文姓名 (name_en)
  424. 3. 中文职位/头衔 (title_zh)
  425. 4. 英文职位/头衔 (title_en)
  426. 5. 中文酒店/公司名称 (hotel_zh)
  427. 6. 英文酒店/公司名称 (hotel_en)
  428. 7. 手机号码 (mobile) - 如有多个,使用逗号分隔
  429. 8. 固定电话 (phone) - 如有多个,使用逗号分隔
  430. 9. 电子邮箱 (email)
  431. 10. 中文地址 (address_zh)
  432. 11. 英文地址 (address_en)
  433. 12. 中文邮政编码 (postal_code_zh)
  434. 13. 英文邮政编码 (postal_code_en)
  435. 14. 品牌组合 (brand_group) - 如有多个品牌,使用逗号分隔
  436. 15. 职业轨迹 (career_path) - 如能从名片中推断,以JSON数组格式返回,包含当前日期,公司名称和职位
  437. 16. 隶属关系 (affiliation) - 如能从名片中推断,以JSON数组格式返回,包含公司名称和隶属集团名称
  438. ## 输出格式
  439. 请以严格的JSON格式返回结果,不要添加任何额外解释文字。JSON格式如下:
  440. ```json
  441. {
  442. "name_zh": "",
  443. "name_en": "",
  444. "title_zh": "",
  445. "title_en": "",
  446. "hotel_zh": "",
  447. "hotel_en": "",
  448. "mobile": "",
  449. "phone": "",
  450. "email": "",
  451. "address_zh": "",
  452. "address_en": "",
  453. "postal_code_zh": "",
  454. "postal_code_en": "",
  455. "brand_group": "",
  456. "career_path": [],
  457. "affiliation": []
  458. }
  459. ```"""
  460. # 调用 Qwen 2.5 VL Plus API
  461. logging.info("发送请求到 Qwen 2.5 VL Plus 模型")
  462. completion = client.chat.completions.create(
  463. model="qwen-vl-plus",
  464. messages=[
  465. {
  466. "role": "user",
  467. "content": [
  468. {"type": "text", "text": prompt},
  469. {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
  470. ]
  471. }
  472. ],
  473. temperature=0.1, # 降低温度增加精确性
  474. response_format={"type": "json_object"} # 要求输出JSON格式
  475. )
  476. # 解析响应
  477. response_content = completion.choices[0].message.content
  478. logging.info(f"成功从 Qwen 模型获取响应: {response_content}")
  479. # 尝试从响应中提取 JSON
  480. try:
  481. json_content = extract_json_from_text(response_content)
  482. extracted_data = json.loads(json_content)
  483. logging.info("成功解析 Qwen 响应中的 JSON")
  484. except json.JSONDecodeError:
  485. logging.warning("无法解析 JSON,尝试从文本中提取信息")
  486. extracted_data = extract_fields_from_text(response_content)
  487. # 确保所有必要字段存在
  488. required_fields = [
  489. 'name_zh', 'name_en', 'title_zh', 'title_en',
  490. 'hotel_zh', 'hotel_en', 'mobile', 'phone',
  491. 'email', 'address_zh', 'address_en',
  492. 'postal_code_zh', 'postal_code_en', 'brand_group', 'career_path'
  493. ]
  494. for field in required_fields:
  495. if field not in extracted_data:
  496. extracted_data[field] = [] if field == 'career_path' else ""
  497. return extracted_data
  498. except Exception as e:
  499. error_msg = f"Qwen 2.5 VL Plus 模型解析失败: {str(e)}"
  500. logging.error(error_msg, exc_info=True)
  501. raise Exception(error_msg)
  502. def process_business_card(image_file):
  503. """
  504. 处理名片图片并提取信息
  505. Args:
  506. image_file (FileStorage): 上传的名片图片文件
  507. Returns:
  508. dict: 处理结果,包含提取的信息和状态
  509. """
  510. minio_path = None
  511. try:
  512. # 读取图片数据
  513. image_data = image_file.read()
  514. image_file.seek(0) # 重置文件指针以便后续读取
  515. try:
  516. # 优先使用 Qwen 2.5 VL Plus 模型直接从图像提取信息
  517. try:
  518. logging.info("尝试使用 Qwen 2.5 VL Plus 模型解析名片")
  519. extracted_data = parse_text_with_qwen25VLplus(image_data)
  520. logging.info("成功使用 Qwen 2.5 VL Plus 模型解析名片")
  521. except Exception as qwen_error:
  522. logging.warning(f"Qwen 模型解析失败,错误原因: {str(qwen_error)}")
  523. # extracted_data = extract_text_from_image(image_data)
  524. except Exception as e:
  525. return {
  526. 'code': 500,
  527. 'success': False,
  528. 'message': f"名片解析失败: {str(e)}",
  529. 'data': None
  530. }
  531. try:
  532. # 生成唯一的文件名
  533. file_ext = os.path.splitext(image_file.filename)[1].lower()
  534. if not file_ext:
  535. file_ext = '.jpg' # 默认扩展名
  536. unique_filename = f"{uuid.uuid4().hex}{file_ext}"
  537. minio_path = f"{unique_filename}"
  538. # 尝试上传到MinIO
  539. minio_client = get_minio_client()
  540. if minio_client:
  541. try:
  542. # 上传文件
  543. logging.info(f"上传文件到MinIO: {minio_path}")
  544. minio_client.put_object(
  545. Bucket=minio_bucket,
  546. Key=minio_path,
  547. Body=image_file,
  548. ContentType=image_file.content_type
  549. )
  550. logging.info(f"图片已上传到MinIO: {minio_path}")
  551. except Exception as upload_err:
  552. logging.error(f"上传文件到MinIO时出错: {str(upload_err)}")
  553. # 即使上传失败,仍继续处理,但路径为None
  554. minio_path = None
  555. else:
  556. minio_path = None
  557. logging.warning("MinIO客户端未初始化,图片未上传")
  558. except Exception as e:
  559. logging.error(f"上传图片到MinIO失败: {str(e)}", exc_info=True)
  560. minio_path = None
  561. try:
  562. # 保存到数据库
  563. business_card = BusinessCard(
  564. name_zh=extracted_data.get('name_zh', ''),
  565. name_en=extracted_data.get('name_en', ''),
  566. title_zh=extracted_data.get('title_zh', ''),
  567. title_en=extracted_data.get('title_en', ''),
  568. mobile=extracted_data.get('mobile', ''),
  569. phone=extracted_data.get('phone', ''),
  570. email=extracted_data.get('email', ''),
  571. hotel_zh=extracted_data.get('hotel_zh', ''),
  572. hotel_en=extracted_data.get('hotel_en', ''),
  573. address_zh=extracted_data.get('address_zh', ''),
  574. address_en=extracted_data.get('address_en', ''),
  575. postal_code_zh=extracted_data.get('postal_code_zh', ''),
  576. postal_code_en=extracted_data.get('postal_code_en', ''),
  577. brand_zh=extracted_data.get('brand_zh', ''),
  578. brand_en=extracted_data.get('brand_en', ''),
  579. affiliation_zh=extracted_data.get('affiliation_zh', ''),
  580. affiliation_en=extracted_data.get('affiliation_en', ''),
  581. image_path=minio_path, # 存储相对路径
  582. career_path=extracted_data.get('career_path', []), # 添加职业轨迹
  583. brand_group=extracted_data.get('brand_group', ''), # 添加品牌组合
  584. status='active',
  585. updated_by='system'
  586. )
  587. db.session.add(business_card)
  588. db.session.commit()
  589. logging.info(f"名片信息已保存到数据库,ID: {business_card.id}")
  590. return {
  591. 'code': 200,
  592. 'success': True,
  593. 'message': '名片解析成功',
  594. 'data': business_card.to_dict()
  595. }
  596. except Exception as e:
  597. db.session.rollback()
  598. error_msg = f"保存名片信息到数据库失败: {str(e)}"
  599. logging.error(error_msg, exc_info=True)
  600. # 即使数据库操作失败,仍返回提取的信息
  601. return {
  602. 'code': 500,
  603. 'success': False,
  604. 'message': error_msg,
  605. 'data': {
  606. 'id': None,
  607. 'name_zh': extracted_data.get('name_zh', ''),
  608. 'name_en': extracted_data.get('name_en', ''),
  609. 'title_zh': extracted_data.get('title_zh', ''),
  610. 'title_en': extracted_data.get('title_en', ''),
  611. 'mobile': extracted_data.get('mobile', ''),
  612. 'phone': extracted_data.get('phone', ''),
  613. 'email': extracted_data.get('email', ''),
  614. 'hotel_zh': extracted_data.get('hotel_zh', ''),
  615. 'hotel_en': extracted_data.get('hotel_en', ''),
  616. 'address_zh': extracted_data.get('address_zh', ''),
  617. 'address_en': extracted_data.get('address_en', ''),
  618. 'postal_code_zh': extracted_data.get('postal_code_zh', ''),
  619. 'postal_code_en': extracted_data.get('postal_code_en', ''),
  620. 'brand_zh': extracted_data.get('brand_zh', ''),
  621. 'brand_en': extracted_data.get('brand_en', ''),
  622. 'affiliation_zh': extracted_data.get('affiliation_zh', ''),
  623. 'affiliation_en': extracted_data.get('affiliation_en', ''),
  624. 'image_path': minio_path, # 返回相对路径
  625. 'career_path': extracted_data.get('career_path', []), # 添加职业轨迹
  626. 'brand_group': extracted_data.get('brand_group', ''), # 添加品牌组合
  627. 'created_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  628. 'updated_at': None,
  629. 'updated_by': 'system',
  630. 'status': 'active'
  631. }
  632. }
  633. except Exception as e:
  634. db.session.rollback()
  635. error_msg = f"名片处理失败: {str(e)}"
  636. logging.error(error_msg, exc_info=True)
  637. return {
  638. 'code': 500,
  639. 'success': False,
  640. 'message': error_msg,
  641. 'data': None
  642. }
  643. def update_business_card(card_id, data):
  644. """
  645. 更新名片信息
  646. Args:
  647. card_id (int): 名片记录ID
  648. data (dict): 包含要更新的字段的字典
  649. Returns:
  650. dict: 包含操作结果和更新后的名片信息
  651. """
  652. try:
  653. # 查找要更新的名片记录
  654. card = BusinessCard.query.get(card_id)
  655. if not card:
  656. return {
  657. 'code': 500,
  658. 'success': False,
  659. 'message': f'未找到ID为{card_id}的名片记录',
  660. 'data': None
  661. }
  662. # 更新名片信息
  663. card.name_zh = data.get('name_zh', card.name_zh)
  664. card.name_en = data.get('name_en', card.name_en)
  665. card.title_zh = data.get('title_zh', card.title_zh)
  666. card.title_en = data.get('title_en', card.title_en)
  667. card.mobile = data.get('mobile', card.mobile)
  668. card.phone = data.get('phone', card.phone)
  669. card.email = data.get('email', card.email)
  670. card.hotel_zh = data.get('hotel_zh', card.hotel_zh)
  671. card.hotel_en = data.get('hotel_en', card.hotel_en)
  672. card.address_zh = data.get('address_zh', card.address_zh)
  673. card.address_en = data.get('address_en', card.address_en)
  674. card.postal_code_zh = data.get('postal_code_zh', card.postal_code_zh)
  675. card.postal_code_en = data.get('postal_code_en', card.postal_code_en)
  676. card.brand_zh = data.get('brand_zh', card.brand_zh)
  677. card.brand_en = data.get('brand_en', card.brand_en)
  678. card.affiliation_zh = data.get('affiliation_zh', card.affiliation_zh)
  679. card.affiliation_en = data.get('affiliation_en', card.affiliation_en)
  680. card.career_path = data.get('career_path', card.career_path) # 更新职业轨迹
  681. card.brand_group = data.get('brand_group', card.brand_group) # 更新品牌组合
  682. card.updated_by = data.get('updated_by', 'user') # 可以根据实际情况修改为当前用户
  683. # 保存更新
  684. db.session.commit()
  685. # 更新成功后,更新Neo4j图数据库中的人才-酒店关系
  686. try:
  687. from app.services.neo4j_driver import neo4j_driver
  688. from app.core.graph.graph_operations import create_or_get_node
  689. # 获取当前时间
  690. current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  691. # 创建或更新人才节点
  692. talent_properties = {
  693. 'pg_id': card_id, # PostgreSQL数据库中的ID
  694. 'name_zh': card.name_zh, # 中文姓名
  695. 'name_en': card.name_en, # 英文姓名
  696. 'mobile': card.mobile, # 手机号码
  697. 'email': card.email, # 电子邮箱
  698. 'updated_at': current_time # 更新时间
  699. }
  700. talent_node_id = create_or_get_node('talent', **talent_properties)
  701. # 如果有酒店信息,创建或更新酒店节点
  702. if card.hotel_zh or card.hotel_en:
  703. hotel_properties = {
  704. 'hotel_zh': card.hotel_zh, # 酒店中文名称
  705. 'hotel_en': card.hotel_en, # 酒店英文名称
  706. 'updated_at': current_time # 更新时间
  707. }
  708. hotel_node_id = create_or_get_node('hotel', **hotel_properties)
  709. # 创建或更新人才与酒店之间的WORK_FOR关系
  710. if talent_node_id and hotel_node_id:
  711. # 构建Cypher查询以创建或更新关系
  712. cypher_query = """
  713. MATCH (t:talent), (h:hotel)
  714. WHERE id(t) = $talent_id AND id(h) = $hotel_id
  715. MERGE (t)-[r:WORKS_FOR]->(h)
  716. SET r.title_zh = $title_zh,
  717. r.title_en = $title_en,
  718. r.updated_at = $updated_at
  719. RETURN r
  720. """
  721. with neo4j_driver.get_session() as session:
  722. session.run(
  723. cypher_query,
  724. talent_id=talent_node_id,
  725. hotel_id=hotel_node_id,
  726. title_zh=card.title_zh,
  727. title_en=card.title_en,
  728. updated_at=current_time
  729. )
  730. logging.info(f"已成功更新人才(ID:{talent_node_id})与酒店(ID:{hotel_node_id})的WORK_FOR关系")
  731. logging.info(f"Neo4j图数据库关系更新成功")
  732. except Exception as e:
  733. logging.error(f"更新Neo4j图数据库关系失败: {str(e)}", exc_info=True)
  734. # 不因为图数据库更新失败而影响PostgreSQL数据库的更新结果
  735. return {
  736. 'code': 200,
  737. 'success': True,
  738. 'message': '名片信息已更新',
  739. 'data': card.to_dict()
  740. }
  741. except Exception as e:
  742. db.session.rollback()
  743. error_msg = f"更新名片信息失败: {str(e)}"
  744. logging.error(error_msg, exc_info=True)
  745. return {
  746. 'code': 500,
  747. 'success': False,
  748. 'message': error_msg,
  749. 'data': None
  750. }
  751. def get_business_cards():
  752. """
  753. 获取所有名片记录列表
  754. Returns:
  755. dict: 包含操作结果和名片列表
  756. """
  757. try:
  758. # 查询所有名片记录
  759. cards = BusinessCard.query.all()
  760. # 将所有记录转换为字典格式
  761. cards_data = [card.to_dict() for card in cards]
  762. return {
  763. 'code': 200,
  764. 'success': True,
  765. 'message': '获取名片列表成功',
  766. 'data': cards_data
  767. }
  768. except Exception as e:
  769. error_msg = f"获取名片列表失败: {str(e)}"
  770. logging.error(error_msg, exc_info=True)
  771. return {
  772. 'code': 500,
  773. 'success': False,
  774. 'message': error_msg,
  775. 'data': []
  776. }
  777. def update_business_card_status(card_id, status):
  778. """
  779. 更新名片状态(激活/禁用)
  780. Args:
  781. card_id (int): 名片记录ID
  782. status (str): 新状态,'active'或'inactive'
  783. Returns:
  784. dict: 包含操作结果和更新后的名片信息
  785. """
  786. try:
  787. # 查找要更新的名片记录
  788. card = BusinessCard.query.get(card_id)
  789. if not card:
  790. return {
  791. 'code': 500,
  792. 'success': False,
  793. 'message': f'未找到ID为{card_id}的名片记录',
  794. 'data': None
  795. }
  796. # 验证状态值
  797. if status not in ['active', 'inactive']:
  798. return {
  799. 'code': 500,
  800. 'success': False,
  801. 'message': f'无效的状态值: {status},必须为 active 或 inactive',
  802. 'data': None
  803. }
  804. # 更新状态
  805. card.status = status
  806. card.updated_at = datetime.now()
  807. card.updated_by = 'system' # 可以根据实际情况修改为当前用户
  808. # 保存更新
  809. db.session.commit()
  810. return {
  811. 'code': 200,
  812. 'success': True,
  813. 'message': f'名片状态已更新为: {status}',
  814. 'data': card.to_dict()
  815. }
  816. except Exception as e:
  817. db.session.rollback()
  818. error_msg = f"更新名片状态失败: {str(e)}"
  819. logging.error(error_msg, exc_info=True)
  820. return {
  821. 'code': 500,
  822. 'success': False,
  823. 'message': error_msg,
  824. 'data': None
  825. }
  826. def create_talent_tag(tag_data):
  827. """
  828. 创建人才标签节点
  829. Args:
  830. tag_data: 包含标签信息的字典,包括:
  831. - name: 标签名称
  832. - category: 标签分类
  833. - description: 标签描述
  834. - status: 启用状态
  835. Returns:
  836. dict: 操作结果字典
  837. """
  838. try:
  839. from app.services.neo4j_driver import neo4j_driver
  840. # 验证必要参数存在
  841. if not tag_data or 'name' not in tag_data or not tag_data['name']:
  842. return {
  843. 'code': 400,
  844. 'success': False,
  845. 'message': '标签名称为必填项',
  846. 'data': None
  847. }
  848. # 准备节点属性
  849. tag_properties = {
  850. 'name': tag_data.get('name'),
  851. 'category': tag_data.get('category', '未分类'),
  852. 'describe': tag_data.get('description', ''), # 使用describe与现有系统保持一致
  853. 'status': tag_data.get('status', 'active'),
  854. 'time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  855. }
  856. # 生成标签的英文名(可选)
  857. from app.core.graph.graph_operations import create_or_get_node
  858. # 如果提供了名称,尝试获取英文翻译
  859. if 'name' in tag_data and tag_data['name']:
  860. try:
  861. from app.api.data_interface.routes import translate_and_parse
  862. en_name = translate_and_parse(tag_data['name'])
  863. tag_properties['en_name'] = en_name[0] if en_name and isinstance(en_name, list) else ''
  864. except Exception as e:
  865. logging.warning(f"获取标签英文名失败: {str(e)}")
  866. tag_properties['en_name'] = ''
  867. # 创建节点
  868. node_id = create_or_get_node('data_label', **tag_properties)
  869. if node_id:
  870. return {
  871. 'code': 200,
  872. 'success': True,
  873. 'message': '人才标签创建成功',
  874. 'data': {
  875. 'id': node_id,
  876. **tag_properties
  877. }
  878. }
  879. else:
  880. return {
  881. 'code': 500,
  882. 'success': False,
  883. 'message': '人才标签创建失败',
  884. 'data': None
  885. }
  886. except Exception as e:
  887. logging.error(f"创建人才标签失败: {str(e)}", exc_info=True)
  888. return {
  889. 'code': 500,
  890. 'success': False,
  891. 'message': f'创建人才标签失败: {str(e)}',
  892. 'data': None
  893. }
  894. def get_talent_tag_list():
  895. """
  896. 从Neo4j图数据库获取人才标签列表
  897. Returns:
  898. dict: 包含操作结果和标签列表的字典
  899. """
  900. try:
  901. from app.services.neo4j_driver import neo4j_driver
  902. # 构建Cypher查询语句,获取分类为talent的标签
  903. query = """
  904. MATCH (n:data_label)
  905. WHERE n.category CONTAINS 'talent' OR n.category CONTAINS '人才'
  906. RETURN id(n) as id, n.name as name, n.en_name as en_name,
  907. n.category as category, n.describe as description,
  908. n.status as status, n.time as time
  909. ORDER BY n.time DESC
  910. """
  911. # 执行查询
  912. tags = []
  913. with neo4j_driver.get_session() as session:
  914. result = session.run(query)
  915. # 处理查询结果
  916. for record in result:
  917. tag = {
  918. 'id': record['id'],
  919. 'name': record['name'],
  920. 'en_name': record['en_name'],
  921. 'category': record['category'],
  922. 'description': record['description'],
  923. 'status': record['status'],
  924. 'time': record['time']
  925. }
  926. tags.append(tag)
  927. return {
  928. 'code': 200,
  929. 'success': True,
  930. 'message': '获取人才标签列表成功',
  931. 'data': tags
  932. }
  933. except Exception as e:
  934. error_msg = f"获取人才标签列表失败: {str(e)}"
  935. logging.error(error_msg, exc_info=True)
  936. return {
  937. 'code': 500,
  938. 'success': False,
  939. 'message': error_msg,
  940. 'data': []
  941. }
  942. def update_talent_tag(tag_id, tag_data):
  943. """
  944. 更新人才标签节点属性
  945. Args:
  946. tag_id: 标签节点ID
  947. tag_data: 包含更新信息的字典,可能包括:
  948. - name: 标签名称
  949. - category: 标签分类
  950. - description: 标签描述
  951. - status: 启用状态
  952. Returns:
  953. dict: 操作结果字典
  954. """
  955. try:
  956. from app.services.neo4j_driver import neo4j_driver
  957. # 准备要更新的属性
  958. update_properties = {}
  959. # 检查并添加需要更新的属性
  960. if 'name' in tag_data and tag_data['name']:
  961. update_properties['name'] = tag_data['name']
  962. # 如果名称更新了,尝试更新英文名称
  963. try:
  964. from app.api.data_interface.routes import translate_and_parse
  965. en_name = translate_and_parse(tag_data['name'])
  966. update_properties['en_name'] = en_name[0] if en_name and isinstance(en_name, list) else ''
  967. except Exception as e:
  968. logging.warning(f"更新标签英文名失败: {str(e)}")
  969. if 'category' in tag_data and tag_data['category']:
  970. update_properties['category'] = tag_data['category']
  971. if 'description' in tag_data:
  972. update_properties['describe'] = tag_data['description']
  973. if 'status' in tag_data:
  974. update_properties['status'] = tag_data['status']
  975. # 添加更新时间
  976. update_properties['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  977. # 如果没有可更新的属性,返回错误
  978. if not update_properties:
  979. return {
  980. 'code': 400,
  981. 'success': False,
  982. 'message': '未提供任何可更新的属性',
  983. 'data': None
  984. }
  985. # 构建更新的Cypher查询
  986. set_clauses = []
  987. params = {'nodeId': tag_id}
  988. for key, value in update_properties.items():
  989. param_name = f"param_{key}"
  990. set_clauses.append(f"n.{key} = ${param_name}")
  991. params[param_name] = value
  992. set_clause = ", ".join(set_clauses)
  993. query = f"""
  994. MATCH (n:data_label)
  995. WHERE id(n) = $nodeId
  996. SET {set_clause}
  997. RETURN id(n) as id, n.name as name, n.en_name as en_name,
  998. n.category as category, n.describe as description,
  999. n.status as status, n.time as time
  1000. """
  1001. # 执行更新查询
  1002. with neo4j_driver.get_session() as session:
  1003. result = session.run(query, **params)
  1004. record = result.single()
  1005. if not record:
  1006. return {
  1007. 'code': 404,
  1008. 'success': False,
  1009. 'message': f'未找到ID为{tag_id}的标签',
  1010. 'data': None
  1011. }
  1012. # 提取更新后的标签信息
  1013. updated_tag = {
  1014. 'id': record['id'],
  1015. 'name': record['name'],
  1016. 'en_name': record['en_name'],
  1017. 'category': record['category'],
  1018. 'description': record['description'],
  1019. 'status': record['status'],
  1020. 'time': record['time']
  1021. }
  1022. return {
  1023. 'code': 200,
  1024. 'success': True,
  1025. 'message': '人才标签更新成功',
  1026. 'data': updated_tag
  1027. }
  1028. except Exception as e:
  1029. error_msg = f"更新人才标签失败: {str(e)}"
  1030. logging.error(error_msg, exc_info=True)
  1031. return {
  1032. 'code': 500,
  1033. 'success': False,
  1034. 'message': error_msg,
  1035. 'data': None
  1036. }
  1037. def delete_talent_tag(tag_id):
  1038. """
  1039. 删除人才标签节点及其相关关系
  1040. Args:
  1041. tag_id: 标签节点ID
  1042. Returns:
  1043. dict: 操作结果字典
  1044. """
  1045. try:
  1046. from app.services.neo4j_driver import neo4j_driver
  1047. # 首先获取要删除的标签信息,以便在成功后返回
  1048. get_query = """
  1049. MATCH (n:data_label)
  1050. WHERE id(n) = $nodeId
  1051. RETURN id(n) as id, n.name as name, n.en_name as en_name,
  1052. n.category as category, n.describe as description,
  1053. n.status as status, n.time as time
  1054. """
  1055. # 构建删除节点和关系的Cypher查询
  1056. delete_query = """
  1057. MATCH (n:data_label)
  1058. WHERE id(n) = $nodeId
  1059. OPTIONAL MATCH (n)-[r]-()
  1060. DELETE r, n
  1061. RETURN count(n) AS deleted
  1062. """
  1063. # 执行查询
  1064. tag_info = None
  1065. with neo4j_driver.get_session() as session:
  1066. # 先获取标签信息
  1067. result = session.run(get_query, nodeId=tag_id)
  1068. record = result.single()
  1069. if not record:
  1070. return {
  1071. 'code': 404,
  1072. 'success': False,
  1073. 'message': f'未找到ID为{tag_id}的标签',
  1074. 'data': None
  1075. }
  1076. # 保存标签信息用于返回
  1077. tag_info = {
  1078. 'id': record['id'],
  1079. 'name': record['name'],
  1080. 'en_name': record['en_name'],
  1081. 'category': record['category'],
  1082. 'description': record['description'],
  1083. 'status': record['status'],
  1084. 'time': record['time']
  1085. }
  1086. # 执行删除操作
  1087. delete_result = session.run(delete_query, nodeId=tag_id)
  1088. deleted = delete_result.single()['deleted']
  1089. if deleted > 0:
  1090. return {
  1091. 'code': 200,
  1092. 'success': True,
  1093. 'message': '人才标签删除成功',
  1094. 'data': tag_info
  1095. }
  1096. else:
  1097. return {
  1098. 'code': 404,
  1099. 'success': False,
  1100. 'message': f'未能删除ID为{tag_id}的标签',
  1101. 'data': None
  1102. }
  1103. except Exception as e:
  1104. error_msg = f"删除人才标签失败: {str(e)}"
  1105. logging.error(error_msg, exc_info=True)
  1106. return {
  1107. 'code': 500,
  1108. 'success': False,
  1109. 'message': error_msg,
  1110. 'data': None
  1111. }
  1112. def query_neo4j_graph(query_requirement):
  1113. """
  1114. 查询Neo4j图数据库,通过Deepseek API生成Cypher脚本
  1115. Args:
  1116. query_requirement (str): 查询需求描述
  1117. Returns:
  1118. dict: 包含查询结果的字典,JSON格式
  1119. """
  1120. try:
  1121. # 导入必要的模块
  1122. from app.services.neo4j_driver import neo4j_driver
  1123. import requests
  1124. import json
  1125. # Deepseek API配置
  1126. api_key = DEEPSEEK_API_KEY
  1127. api_url = DEEPSEEK_API_URL
  1128. # 步骤1: 从Neo4j获取所有标签列表
  1129. logging.info("第一步:从Neo4j获取人才类别的标签列表")
  1130. all_labels_query = """
  1131. MATCH (dl:data_label)
  1132. WHERE dl.category CONTAINS '人才' OR dl.category CONTAINS 'talent'
  1133. RETURN dl.name as name
  1134. """
  1135. all_labels = []
  1136. with neo4j_driver.get_session() as session:
  1137. result = session.run(all_labels_query)
  1138. for record in result:
  1139. all_labels.append(record['name'])
  1140. logging.info(f"获取到{len(all_labels)}个人才标签: {all_labels}")
  1141. # 步骤2: 使用Deepseek判断查询需求中的关键信息与标签的对应关系
  1142. logging.info("第二步:调用Deepseek API匹配查询需求与标签")
  1143. # 构建所有标签的JSON字符串
  1144. labels_json = json.dumps(all_labels, ensure_ascii=False)
  1145. # 构建匹配标签的提示语
  1146. matching_prompt = f"""
  1147. 请分析以下查询需求,并从标签列表中找出与查询需求相关的标签。
  1148. ## 查询需求
  1149. {query_requirement}
  1150. ## 可用标签列表
  1151. {labels_json}
  1152. ## 输出要求
  1153. 1. 请以JSON数组格式返回匹配的标签名称列表,格式如: ["标签1", "标签2", "标签3"]
  1154. 2. 只返回标签名称数组,不要包含任何解释或其他文本
  1155. 3. 如果没有找到匹配的标签,请返回空数组 []
  1156. """
  1157. # 调用Deepseek API匹配标签
  1158. headers = {
  1159. "Authorization": f"Bearer {api_key}",
  1160. "Content-Type": "application/json"
  1161. }
  1162. payload = {
  1163. "model": "deepseek-chat",
  1164. "messages": [
  1165. {"role": "system", "content": "你是一个专业的文本分析和匹配专家。"},
  1166. {"role": "user", "content": matching_prompt}
  1167. ],
  1168. "temperature": 0.1,
  1169. "response_format": {"type": "json_object"}
  1170. }
  1171. logging.info("发送请求到Deepseek API匹配标签:"+matching_prompt)
  1172. response = requests.post(api_url, headers=headers, json=payload, timeout=30)
  1173. response.raise_for_status()
  1174. # 解析API响应
  1175. result = response.json()
  1176. matching_content = result.get("choices", [{}])[0].get("message", {}).get("content", "[]")
  1177. # 提取JSON数组
  1178. try:
  1179. # 尝试直接解析返回结果,预期格式为 ["新开酒店经验", "五星级酒店", "总经理"]
  1180. logging.info(f"Deepseek返回的匹配内容: {matching_content}")
  1181. # 如果返回的是JSON字符串,先去除可能的前后缀文本
  1182. if isinstance(matching_content, str):
  1183. # 查找JSON数组的开始和结束位置
  1184. start_idx = matching_content.find('[')
  1185. end_idx = matching_content.rfind(']') + 1
  1186. if start_idx >= 0 and end_idx > start_idx:
  1187. json_str = matching_content[start_idx:end_idx]
  1188. matched_labels = json.loads(json_str)
  1189. else:
  1190. matched_labels = []
  1191. else:
  1192. matched_labels = []
  1193. # 确保结果是字符串列表
  1194. if matched_labels and all(isinstance(item, str) for item in matched_labels):
  1195. logging.info(f"成功解析到标签列表: {matched_labels}")
  1196. else:
  1197. logging.warning("解析结果不是预期的字符串列表格式,将使用空列表")
  1198. matched_labels = []
  1199. except json.JSONDecodeError as e:
  1200. logging.error(f"JSON解析错误: {str(e)}")
  1201. matched_labels = []
  1202. except Exception as e:
  1203. logging.error(f"解析匹配标签时出错: {str(e)}")
  1204. matched_labels = []
  1205. logging.info(f"匹配到的标签: {matched_labels}")
  1206. # 如果没有匹配到标签,返回空结果
  1207. if not matched_labels:
  1208. return {
  1209. 'code': 200,
  1210. 'success': True,
  1211. 'message': '未找到与查询需求匹配的标签',
  1212. 'query': '',
  1213. 'data': []
  1214. }
  1215. # 步骤3: 构建Cypher生成提示文本
  1216. logging.info("第三步:构建提示文本生成Cypher查询语句")
  1217. # 将匹配的标签转换为字符串
  1218. matched_labels_str = ", ".join([f"'{label}'" for label in matched_labels])
  1219. # 构建生成Cypher的提示语
  1220. cypher_prompt = f"""
  1221. 请根据以下Neo4j图数据库结构和已匹配的标签,生成一个Cypher查询脚本。
  1222. ## 图数据库结构
  1223. ### 节点
  1224. 1. talent - 人才节点
  1225. 属性: pg_id(PostgreSQL数据库ID), name_zh(中文姓名), name_en(英文姓名),
  1226. mobile(手机号码), email(电子邮箱), updated_at(更新时间)
  1227. 2. data_label - 人才标签节点
  1228. ### 关系
  1229. BELONGS_TO - 从属关系
  1230. (talent)-[BELONGS_TO]->(data_label) - 人才属于某标签
  1231. ## 匹配的标签列表
  1232. [{matched_labels_str}]
  1233. ## 查询需求
  1234. {query_requirement}
  1235. ## 输出要求
  1236. 1. 只输出有效的Cypher查询语句,不要包含任何解释或注释
  1237. 2. 确保return语句中包含talent节点属性
  1238. 3. 尽量利用图数据库的特性来优化查询效率
  1239. 4. 使用WITH子句和COLLECT函数收集标签,确保查询到同时拥有所有标签的人才
  1240. 注意:请直接返回Cypher查询语句,无需任何其他文本。
  1241. 以下是一个示例:
  1242. 假设匹配的标签是 ['五星级酒店', '新开酒店经验', '总经理']
  1243. 生成的Cypher查询语句应该是:
  1244. MATCH (t:talent)-[:BELONGS_TO]->(dl:data_label)
  1245. WHERE dl.name IN ['五星级酒店', '新开酒店经验', '总经理']
  1246. WITH t, COLLECT(DISTINCT dl.name) AS labels
  1247. WHERE size(labels) = 3
  1248. RETURN t.pg_id as pg_id, t.name_zh as name_zh, t.name_en as name_en, t.mobile as mobile, t.email as email, t.updated_at as updated_at
  1249. """
  1250. # 调用Deepseek API生成Cypher脚本
  1251. payload = {
  1252. "model": "deepseek-chat",
  1253. "messages": [
  1254. {"role": "system", "content": "你是一个专业的Neo4j Cypher查询专家。"},
  1255. {"role": "user", "content": cypher_prompt}
  1256. ],
  1257. "temperature": 0.1
  1258. }
  1259. logging.info("发送请求到Deepseek API生成Cypher脚本")
  1260. response = requests.post(api_url, headers=headers, json=payload, timeout=30)
  1261. response.raise_for_status()
  1262. # 解析API响应
  1263. result = response.json()
  1264. cypher_script = result.get("choices", [{}])[0].get("message", {}).get("content", "")
  1265. # 清理Cypher脚本,移除不必要的markdown格式或注释
  1266. cypher_script = cypher_script.strip()
  1267. if cypher_script.startswith("```cypher"):
  1268. cypher_script = cypher_script[9:]
  1269. elif cypher_script.startswith("```"):
  1270. cypher_script = cypher_script[3:]
  1271. if cypher_script.endswith("```"):
  1272. cypher_script = cypher_script[:-3]
  1273. cypher_script = cypher_script.strip()
  1274. logging.info(f"生成的Cypher脚本: {cypher_script}")
  1275. # 步骤4: 执行Cypher脚本
  1276. logging.info("第四步:执行Cypher脚本并返回结果")
  1277. with neo4j_driver.get_session() as session:
  1278. result = session.run(cypher_script)
  1279. records = [record.data() for record in result]
  1280. # 构建查询结果
  1281. response_data = {
  1282. 'code': 200,
  1283. 'success': True,
  1284. 'message': '查询成功执行',
  1285. 'query': cypher_script,
  1286. 'matched_labels': matched_labels,
  1287. 'data': records
  1288. }
  1289. return response_data
  1290. except requests.exceptions.HTTPError as e:
  1291. error_msg = f"调用Deepseek API失败: {str(e)}"
  1292. logging.error(error_msg)
  1293. if hasattr(e, 'response') and e.response:
  1294. logging.error(f"错误状态码: {e.response.status_code}")
  1295. logging.error(f"错误内容: {e.response.text}")
  1296. return {
  1297. 'code': 500,
  1298. 'success': False,
  1299. 'message': error_msg,
  1300. 'data': []
  1301. }
  1302. except Exception as e:
  1303. error_msg = f"查询Neo4j图数据库失败: {str(e)}"
  1304. logging.error(error_msg, exc_info=True)
  1305. return {
  1306. 'code': 500,
  1307. 'success': False,
  1308. 'message': error_msg,
  1309. 'data': []
  1310. }
  1311. def talent_get_tags(talent_id):
  1312. """
  1313. 根据talent ID获取人才节点关联的标签
  1314. Args:
  1315. talent_id (int): 人才节点pg_id
  1316. Returns:
  1317. dict: 包含人才ID和关联标签的字典,JSON格式
  1318. """
  1319. try:
  1320. # 导入必要的模块
  1321. from app.services.neo4j_driver import neo4j_driver
  1322. # 准备查询返回数据
  1323. response_data = {
  1324. 'code': 200,
  1325. 'success': True,
  1326. 'message': '获取人才标签成功',
  1327. 'data': []
  1328. }
  1329. # 构建Cypher查询语句,获取人才节点关联的标签
  1330. cypher_query = """
  1331. MATCH (t:talent)-[r:BELONGS_TO]->(tag:data_label)
  1332. WHERE t.pg_id = $talent_id
  1333. RETURN t.pg_id as talent_id, tag.name as tag_name
  1334. """
  1335. # 执行查询
  1336. with neo4j_driver.get_session() as session:
  1337. result = session.run(cypher_query, talent_id=int(talent_id))
  1338. records = list(result)
  1339. # 如果没有查询到标签,返回空数组
  1340. if not records:
  1341. response_data['message'] = f'人才pg_id {talent_id} 没有关联的标签'
  1342. return response_data
  1343. # 处理查询结果
  1344. for record in records:
  1345. talent_tag = {
  1346. 'talent': record['talent_id'],
  1347. 'tag': record['tag_name']
  1348. }
  1349. response_data['data'].append(talent_tag)
  1350. return response_data
  1351. except Exception as e:
  1352. error_msg = f"获取人才标签失败: {str(e)}"
  1353. logging.error(error_msg, exc_info=True)
  1354. return {
  1355. 'code': 500,
  1356. 'success': False,
  1357. 'message': error_msg,
  1358. 'data': []
  1359. }
  1360. def talent_update_tags(data):
  1361. """
  1362. 根据传入的JSON数据为人才节点创建与标签的BELONGS_TO关系
  1363. Args:
  1364. data (list): 包含talent和tag字段的对象列表
  1365. 例如: [
  1366. {"talent": 12345, "tag": "市场营销"},
  1367. {"talent": 12345, "tag": "酒店管理"}
  1368. ]
  1369. Returns:
  1370. dict: 操作结果和状态信息
  1371. """
  1372. try:
  1373. # 导入必要的模块
  1374. from app.services.neo4j_driver import neo4j_driver
  1375. # 验证输入参数
  1376. if not isinstance(data, list):
  1377. return {
  1378. 'code': 400,
  1379. 'success': False,
  1380. 'message': '参数格式错误,需要JSON数组',
  1381. 'data': None
  1382. }
  1383. if len(data) == 0:
  1384. return {
  1385. 'code': 400,
  1386. 'success': False,
  1387. 'message': '数据列表为空',
  1388. 'data': None
  1389. }
  1390. # 获取当前时间
  1391. current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  1392. # 成功和失败计数
  1393. success_count = 0
  1394. failed_items = []
  1395. # 按talent分组处理数据
  1396. talent_tags = {}
  1397. for item in data:
  1398. # 验证每个项目的格式
  1399. if not isinstance(item, dict) or 'talent' not in item or 'tag' not in item:
  1400. failed_items.append(item)
  1401. continue
  1402. talent_id = item.get('talent')
  1403. tag_name = item.get('tag')
  1404. # 验证talent_id和tag_name的值
  1405. if not talent_id or not tag_name or not isinstance(tag_name, str):
  1406. failed_items.append(item)
  1407. continue
  1408. # 按talent_id分组
  1409. if talent_id not in talent_tags:
  1410. talent_tags[talent_id] = []
  1411. talent_tags[talent_id].append(tag_name)
  1412. with neo4j_driver.get_session() as session:
  1413. # 处理每个talent及其标签
  1414. for talent_id, tags in talent_tags.items():
  1415. # 首先验证talent节点是否存在
  1416. check_talent_query = """
  1417. MATCH (t:talent)
  1418. WHERE t.pg_id = $talent_id
  1419. RETURN t
  1420. """
  1421. talent_result = session.run(check_talent_query, talent_id=int(talent_id))
  1422. if not talent_result.single():
  1423. # 该talent不存在,记录失败项并继续下一个talent
  1424. for tag in tags:
  1425. failed_items.append({'talent_pg_id': talent_id, 'tag': tag})
  1426. continue
  1427. # 首先清除所有现有的BELONGS_TO关系
  1428. clear_relations_query = """
  1429. MATCH (t:talent)-[r:BELONGS_TO]->(:data_label)
  1430. WHERE t.pg_id = $talent_id
  1431. DELETE r
  1432. RETURN count(r) as deleted_count
  1433. """
  1434. clear_result = session.run(clear_relations_query, talent_id=int(talent_id))
  1435. deleted_count = clear_result.single()['deleted_count']
  1436. logging.info(f"已删除talent_id={talent_id}的{deleted_count}个已有标签关系")
  1437. # 处理每个标签
  1438. for tag_name in tags:
  1439. try:
  1440. # 1. 查找或创建标签节点
  1441. # 先查找是否存在该标签
  1442. find_tag_query = """
  1443. MATCH (tag:data_label)
  1444. WHERE tag.name = $tag_name
  1445. RETURN id(tag) as tag_id
  1446. """
  1447. tag_result = session.run(find_tag_query, tag_name=tag_name)
  1448. tag_record = tag_result.single()
  1449. if tag_record:
  1450. tag_id = tag_record['tag_id']
  1451. else:
  1452. # 创建新标签
  1453. create_tag_query = """
  1454. CREATE (tag:data_label {name: $name, category: $category, updated_at: $updated_at})
  1455. RETURN id(tag) as tag_id
  1456. """
  1457. tag_result = session.run(
  1458. create_tag_query,
  1459. name=tag_name,
  1460. category='talent',
  1461. updated_at=current_time
  1462. )
  1463. tag_record = tag_result.single()
  1464. tag_id = tag_record['tag_id']
  1465. # 2. 创建人才与标签的BELONGS_TO关系
  1466. create_relation_query = """
  1467. MATCH (t:talent), (tag:data_label)
  1468. WHERE t.pg_id = $talent_id AND tag.name = $tag_name
  1469. CREATE (t)-[r:BELONGS_TO]->(tag)
  1470. SET r.created_at = $current_time
  1471. RETURN r
  1472. """
  1473. relation_result = session.run(
  1474. create_relation_query,
  1475. talent_id=int(talent_id),
  1476. tag_name=tag_name,
  1477. current_time=current_time
  1478. )
  1479. if relation_result.single():
  1480. success_count += 1
  1481. else:
  1482. failed_items.append({'talent_pg_id': talent_id, 'tag': tag_name})
  1483. except Exception as tag_error:
  1484. logging.error(f"为标签 {tag_name} 创建关系时出错: {str(tag_error)}")
  1485. failed_items.append({'talent_pg_id': talent_id, 'tag': tag_name})
  1486. # 返回结果
  1487. total_items = len(data)
  1488. if success_count == total_items:
  1489. return {
  1490. 'code': 200,
  1491. 'success': True,
  1492. 'message': f'成功创建或更新了 {success_count} 个标签关系',
  1493. 'data': {
  1494. 'success_count': success_count,
  1495. 'total_count': total_items,
  1496. 'failed_items': []
  1497. }
  1498. }
  1499. elif success_count > 0:
  1500. return {
  1501. 'code': 206, # Partial Content
  1502. 'success': True,
  1503. 'message': f'部分成功: 创建或更新了 {success_count}/{total_items} 个标签关系',
  1504. 'data': {
  1505. 'success_count': success_count,
  1506. 'total_count': total_items,
  1507. 'failed_items': failed_items
  1508. }
  1509. }
  1510. else:
  1511. return {
  1512. 'code': 500,
  1513. 'success': False,
  1514. 'message': '无法创建任何标签关系',
  1515. 'data': {
  1516. 'success_count': 0,
  1517. 'total_count': total_items,
  1518. 'failed_items': failed_items
  1519. }
  1520. }
  1521. except Exception as e:
  1522. error_msg = f"更新人才标签关系失败: {str(e)}"
  1523. logging.error(error_msg, exc_info=True)
  1524. return {
  1525. 'code': 500,
  1526. 'success': False,
  1527. 'message': error_msg,
  1528. 'data': None
  1529. }
  1530. def get_business_card(card_id):
  1531. """
  1532. 根据ID从PostgreSQL数据库中获取名片记录
  1533. Args:
  1534. card_id (int): 名片记录ID
  1535. Returns:
  1536. dict: 包含操作结果和名片信息的字典
  1537. """
  1538. try:
  1539. # 查询指定ID的名片记录
  1540. card = BusinessCard.query.get(card_id)
  1541. if not card:
  1542. return {
  1543. 'code': 404,
  1544. 'success': False,
  1545. 'message': f'未找到ID为{card_id}的名片记录',
  1546. 'data': None
  1547. }
  1548. # 将记录转换为字典格式返回
  1549. return {
  1550. 'code': 200,
  1551. 'success': True,
  1552. 'message': '获取名片记录成功',
  1553. 'data': card.to_dict()
  1554. }
  1555. except Exception as e:
  1556. error_msg = f"获取名片记录失败: {str(e)}"
  1557. logging.error(error_msg, exc_info=True)
  1558. return {
  1559. 'code': 500,
  1560. 'success': False,
  1561. 'message': error_msg,
  1562. 'data': None
  1563. }