parse.py 75 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154
  1. from typing import Dict, Any
  2. from app import db
  3. from datetime import datetime
  4. import os
  5. import boto3
  6. from botocore.config import Config
  7. import logging
  8. import requests
  9. import json
  10. import re
  11. import uuid
  12. from PIL import Image
  13. from io import BytesIO
  14. import pytesseract
  15. import base64
  16. from openai import OpenAI
  17. from app.config.config import DevelopmentConfig, ProductionConfig
  18. # 名片解析数据模型
  19. class BusinessCard(db.Model):
  20. __tablename__ = 'business_cards'
  21. id = db.Column(db.Integer, primary_key=True, autoincrement=True)
  22. name_zh = db.Column(db.String(100), nullable=False)
  23. name_en = db.Column(db.String(100))
  24. title_zh = db.Column(db.String(100))
  25. title_en = db.Column(db.String(100))
  26. mobile = db.Column(db.String(100))
  27. phone = db.Column(db.String(50))
  28. email = db.Column(db.String(100))
  29. hotel_zh = db.Column(db.String(200))
  30. hotel_en = db.Column(db.String(200))
  31. address_zh = db.Column(db.Text)
  32. address_en = db.Column(db.Text)
  33. postal_code_zh = db.Column(db.String(20))
  34. postal_code_en = db.Column(db.String(20))
  35. brand_zh = db.Column(db.String(100))
  36. brand_en = db.Column(db.String(100))
  37. affiliation_zh = db.Column(db.String(200))
  38. affiliation_en = db.Column(db.String(200))
  39. birthday = db.Column(db.Date) # 生日,存储年月日
  40. age = db.Column(db.Integer) # 年龄字段
  41. native_place = db.Column(db.Text) # 籍贯字段
  42. residence = db.Column(db.Text) # 居住地
  43. image_path = db.Column(db.String(255)) # MinIO中存储的路径
  44. career_path = db.Column(db.JSON) # 职业轨迹,JSON格式
  45. brand_group = db.Column(db.String(200)) # 品牌组合
  46. origin_source = db.Column(db.JSON) # 原始资料记录,JSON格式
  47. talent_profile = db.Column(db.Text) # 人才档案,文本格式
  48. created_at = db.Column(db.DateTime, default=datetime.now, nullable=False)
  49. updated_at = db.Column(db.DateTime, onupdate=datetime.now)
  50. updated_by = db.Column(db.String(50))
  51. status = db.Column(db.String(20), default='active')
  52. def to_dict(self):
  53. return {
  54. 'id': self.id,
  55. 'name_zh': self.name_zh,
  56. 'name_en': self.name_en,
  57. 'title_zh': self.title_zh,
  58. 'title_en': self.title_en,
  59. 'mobile': self.mobile,
  60. 'phone': self.phone,
  61. 'email': self.email,
  62. 'hotel_zh': self.hotel_zh,
  63. 'hotel_en': self.hotel_en,
  64. 'address_zh': self.address_zh,
  65. 'address_en': self.address_en,
  66. 'postal_code_zh': self.postal_code_zh,
  67. 'postal_code_en': self.postal_code_en,
  68. 'brand_zh': self.brand_zh,
  69. 'brand_en': self.brand_en,
  70. 'affiliation_zh': self.affiliation_zh,
  71. 'affiliation_en': self.affiliation_en,
  72. 'birthday': self.birthday.strftime('%Y-%m-%d') if self.birthday else None,
  73. 'age': self.age,
  74. 'native_place': self.native_place,
  75. 'residence': self.residence,
  76. 'image_path': self.image_path,
  77. 'career_path': self.career_path,
  78. 'brand_group': self.brand_group,
  79. 'origin_source': self.origin_source,
  80. 'talent_profile': self.talent_profile,
  81. 'created_at': self.created_at.strftime('%Y-%m-%d %H:%M:%S') if self.created_at else None,
  82. 'updated_at': self.updated_at.strftime('%Y-%m-%d %H:%M:%S') if self.updated_at else None,
  83. 'updated_by': self.updated_by,
  84. 'status': self.status
  85. }
  86. # 重复名片处理数据模型
  87. class DuplicateBusinessCard(db.Model):
  88. __tablename__ = 'duplicate_business_cards'
  89. id = db.Column(db.Integer, primary_key=True, autoincrement=True)
  90. main_card_id = db.Column(db.Integer, db.ForeignKey('business_cards.id'), nullable=False) # 新创建的主记录ID
  91. suspected_duplicates = db.Column(db.JSON, nullable=False) # 疑似重复记录列表,JSON格式
  92. duplicate_reason = db.Column(db.String(200), nullable=False) # 重复原因
  93. processing_status = db.Column(db.String(20), default='pending') # 处理状态:pending/processed/ignored
  94. created_at = db.Column(db.DateTime, default=datetime.now, nullable=False)
  95. processed_at = db.Column(db.DateTime) # 处理时间
  96. processed_by = db.Column(db.String(50)) # 处理人
  97. processing_notes = db.Column(db.Text) # 处理备注
  98. # 关联主记录
  99. main_card = db.relationship('BusinessCard', backref=db.backref('as_main_duplicate_records', lazy=True))
  100. def to_dict(self):
  101. return {
  102. 'id': self.id,
  103. 'main_card_id': self.main_card_id,
  104. 'suspected_duplicates': self.suspected_duplicates,
  105. 'duplicate_reason': self.duplicate_reason,
  106. 'processing_status': self.processing_status,
  107. 'created_at': self.created_at.strftime('%Y-%m-%d %H:%M:%S') if self.created_at else None,
  108. 'processed_at': self.processed_at.strftime('%Y-%m-%d %H:%M:%S') if self.processed_at else None,
  109. 'processed_by': self.processed_by,
  110. 'processing_notes': self.processing_notes
  111. }
  112. # 解析任务存储库数据模型
  113. class ParseTaskRepository(db.Model):
  114. __tablename__ = 'parse_task_repository'
  115. id = db.Column(db.Integer, primary_key=True, autoincrement=True)
  116. task_name = db.Column(db.String(100), nullable=False)
  117. task_status = db.Column(db.String(10), nullable=False)
  118. task_type = db.Column(db.String(50), nullable=False)
  119. task_source = db.Column(db.String(300), nullable=False)
  120. collection_count = db.Column(db.Integer, nullable=False, default=0)
  121. parse_count = db.Column(db.Integer, nullable=False, default=0)
  122. parse_result = db.Column(db.JSON)
  123. created_at = db.Column(db.DateTime, default=datetime.now, nullable=False)
  124. created_by = db.Column(db.String(50), nullable=False)
  125. updated_at = db.Column(db.DateTime, default=datetime.now, onupdate=datetime.now, nullable=False)
  126. updated_by = db.Column(db.String(50), nullable=False)
  127. def to_dict(self):
  128. return {
  129. 'id': self.id,
  130. 'task_name': self.task_name,
  131. 'task_status': self.task_status,
  132. 'task_type': self.task_type,
  133. 'task_source': self.task_source,
  134. 'collection_count': self.collection_count,
  135. 'parse_count': self.parse_count,
  136. 'parse_result': self.parse_result,
  137. 'created_at': self.created_at.strftime('%Y-%m-%d %H:%M:%S') if self.created_at else None,
  138. 'created_by': self.created_by,
  139. 'updated_at': self.updated_at.strftime('%Y-%m-%d %H:%M:%S') if self.updated_at else None,
  140. 'updated_by': self.updated_by
  141. }
  142. # 配置变量
  143. # 使用配置变量,缺省认为在生产环境运行
  144. config = ProductionConfig()
  145. # 使用配置变量
  146. minio_url = f"{'https' if getattr(config, 'MINIO_SECURE', False) else 'http'}://{getattr(config, 'MINIO_HOST', 'localhost')}"
  147. minio_access_key = getattr(config, 'MINIO_USER', 'minioadmin')
  148. minio_secret_key = getattr(config, 'MINIO_PASSWORD', 'minioadmin')
  149. minio_bucket = getattr(config, 'MINIO_BUCKET', 'dataops')
  150. use_ssl = getattr(config, 'MINIO_SECURE', False)
  151. # API密钥配置
  152. DEEPSEEK_API_KEY = getattr(config, 'DEEPSEEK_API_KEY', '')
  153. DEEPSEEK_API_URL = getattr(config, 'DEEPSEEK_API_URL', 'https://api.deepseek.com/v1/chat/completions')
  154. QWEN_API_KEY = getattr(config, 'QWEN_API_KEY', '')
  155. QWEN_API_URL = getattr(config, 'QWEN_API_URL', 'https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation')
  156. # OCR配置
  157. OCR_LANG = getattr(config, 'OCR_LANG', 'chi_sim+eng')
  158. # 名片解析功能模块
  159. def normalize_mobile_numbers(mobile_str):
  160. """
  161. 标准化手机号码字符串,去重并限制最多3个
  162. Args:
  163. mobile_str (str): 手机号码字符串,可能包含多个手机号码,用逗号分隔
  164. Returns:
  165. str: 标准化后的手机号码字符串,最多3个,用逗号分隔
  166. """
  167. if not mobile_str or not mobile_str.strip():
  168. return ''
  169. # 按逗号分割并清理每个手机号码
  170. mobiles = []
  171. for mobile in mobile_str.split(','):
  172. mobile = mobile.strip()
  173. if mobile and mobile not in mobiles: # 去重
  174. mobiles.append(mobile)
  175. # 限制最多3个手机号码
  176. return ','.join(mobiles[:3])
  177. def mobile_numbers_overlap(mobile1, mobile2):
  178. """
  179. 检查两个手机号码字符串是否有重叠
  180. Args:
  181. mobile1 (str): 第一个手机号码字符串
  182. mobile2 (str): 第二个手机号码字符串
  183. Returns:
  184. bool: 是否有重叠的手机号码
  185. """
  186. if not mobile1 or not mobile2:
  187. return False
  188. mobiles1 = set(mobile.strip() for mobile in mobile1.split(',') if mobile.strip())
  189. mobiles2 = set(mobile.strip() for mobile in mobile2.split(',') if mobile.strip())
  190. return bool(mobiles1 & mobiles2) # 检查交集
  191. def merge_mobile_numbers(existing_mobile, new_mobile):
  192. """
  193. 合并手机号码,去重并限制最多3个
  194. Args:
  195. existing_mobile (str): 现有手机号码字符串
  196. new_mobile (str): 新手机号码字符串
  197. Returns:
  198. str: 合并后的手机号码字符串,最多3个,用逗号分隔
  199. """
  200. mobiles = []
  201. # 添加现有手机号码
  202. if existing_mobile:
  203. for mobile in existing_mobile.split(','):
  204. mobile = mobile.strip()
  205. if mobile and mobile not in mobiles:
  206. mobiles.append(mobile)
  207. # 添加新手机号码
  208. if new_mobile:
  209. for mobile in new_mobile.split(','):
  210. mobile = mobile.strip()
  211. if mobile and mobile not in mobiles:
  212. mobiles.append(mobile)
  213. # 限制最多3个手机号码
  214. return ','.join(mobiles[:3])
  215. def check_duplicate_business_card(extracted_data):
  216. """
  217. 检查是否存在重复的名片记录
  218. Args:
  219. extracted_data (dict): 提取的名片信息
  220. Returns:
  221. dict: 包含检查结果的字典,格式为:
  222. {
  223. 'is_duplicate': bool,
  224. 'action': str, # 'update', 'create_with_duplicates' 或 'create_new'
  225. 'existing_card': BusinessCard 或 None,
  226. 'suspected_duplicates': list, # 疑似重复记录列表
  227. 'reason': str
  228. }
  229. """
  230. try:
  231. # 提取关键信息进行匹配
  232. name_zh = extracted_data.get('name_zh', '').strip()
  233. mobile = extracted_data.get('mobile', '').strip()
  234. # 如果没有姓名,无法进行有效的重复检测
  235. if not name_zh:
  236. return {
  237. 'is_duplicate': False,
  238. 'action': 'create_new',
  239. 'existing_card': None,
  240. 'suspected_duplicates': [],
  241. 'reason': '缺少中文姓名,无法进行重复检测'
  242. }
  243. # 根据姓名进行精确匹配
  244. name_matches = BusinessCard.query.filter_by(name_zh=name_zh).all()
  245. # 如果有手机号,同时检查手机号匹配
  246. mobile_matches = []
  247. if mobile:
  248. # 标准化手机号进行比较
  249. normalized_mobile = normalize_mobile_numbers(mobile)
  250. if normalized_mobile:
  251. # 查找所有有手机号的记录
  252. all_cards_with_mobile = BusinessCard.query.filter(BusinessCard.mobile.isnot(None)).all()
  253. for card in all_cards_with_mobile:
  254. if card.mobile and mobile_numbers_overlap(normalized_mobile, card.mobile):
  255. mobile_matches.append(card)
  256. # 合并姓名匹配和手机号匹配的结果,去重
  257. all_matches = []
  258. for card in name_matches + mobile_matches:
  259. if card not in all_matches:
  260. all_matches.append(card)
  261. if not all_matches:
  262. # 没有找到匹配记录,创建新记录
  263. return {
  264. 'is_duplicate': False,
  265. 'action': 'create_new',
  266. 'existing_card': None,
  267. 'suspected_duplicates': [],
  268. 'reason': '未找到重复记录,将创建新记录'
  269. }
  270. elif len(all_matches) == 1:
  271. # 找到一个匹配记录
  272. existing_card = all_matches[0]
  273. # 检查是否是完全匹配(姓名和手机号都相同)
  274. existing_mobile = existing_card.mobile or ''
  275. is_name_match = existing_card.name_zh == name_zh
  276. is_mobile_match = mobile and mobile_numbers_overlap(mobile, existing_mobile)
  277. if is_name_match and is_mobile_match:
  278. # 完全匹配,更新现有记录
  279. return {
  280. 'is_duplicate': True,
  281. 'action': 'update',
  282. 'existing_card': existing_card,
  283. 'suspected_duplicates': [],
  284. 'reason': f'找到完全匹配的记录 (ID: {existing_card.id}),将更新现有记录'
  285. }
  286. else:
  287. # 部分匹配,标记为疑似重复
  288. return {
  289. 'is_duplicate': True,
  290. 'action': 'create_with_duplicates',
  291. 'existing_card': None,
  292. 'suspected_duplicates': [existing_card],
  293. 'reason': f'找到疑似重复记录 (ID: {existing_card.id}),将创建新记录并标记重复'
  294. }
  295. else:
  296. # 找到多个匹配记录,标记为疑似重复
  297. return {
  298. 'is_duplicate': True,
  299. 'action': 'create_with_duplicates',
  300. 'existing_card': None,
  301. 'suspected_duplicates': all_matches,
  302. 'reason': f'找到 {len(all_matches)} 个疑似重复记录,将创建新记录并标记重复'
  303. }
  304. except Exception as e:
  305. logging.error(f"重复检测过程中发生错误: {str(e)}", exc_info=True)
  306. # 出错时默认创建新记录
  307. return {
  308. 'is_duplicate': False,
  309. 'action': 'create_new',
  310. 'existing_card': None,
  311. 'suspected_duplicates': [],
  312. 'reason': f'重复检测失败: {str(e)},将创建新记录'
  313. }
  314. def update_career_path(existing_card, new_data, image_path=None):
  315. """
  316. 更新名片的职业轨迹信息
  317. Args:
  318. existing_card: 现有的名片记录对象
  319. new_data (dict): 新的职位信息
  320. image_path (str): 新的图片路径
  321. Returns:
  322. list: 更新后的职业轨迹列表
  323. """
  324. try:
  325. # 获取现有的职业轨迹,如果没有则初始化为空列表
  326. career_path = existing_card.career_path if existing_card.career_path else []
  327. # 确保career_path是列表格式
  328. if not isinstance(career_path, list):
  329. career_path = []
  330. # 构建新的职业记录
  331. new_career_entry = {
  332. 'hotel_zh': new_data.get('hotel_zh', ''),
  333. 'hotel_en': new_data.get('hotel_en', ''),
  334. 'title_zh': new_data.get('title_zh', ''),
  335. 'title_en': new_data.get('title_en', ''),
  336. 'start_date': datetime.now().strftime('%Y-%m-%d'),
  337. 'image_path': image_path or existing_card.image_path
  338. }
  339. # 检查是否与最新的职业记录相同,避免重复添加
  340. if career_path:
  341. latest_entry = career_path[-1]
  342. if (latest_entry.get('hotel_zh') == new_career_entry['hotel_zh'] and
  343. latest_entry.get('title_zh') == new_career_entry['title_zh']):
  344. # 如果职位信息相同,只更新图片路径和时间
  345. latest_entry['image_path'] = new_career_entry['image_path']
  346. latest_entry['start_date'] = new_career_entry['start_date']
  347. return career_path
  348. # 添加新的职业记录
  349. career_path.append(new_career_entry)
  350. # 限制职业轨迹记录数量(最多保留10条)
  351. if len(career_path) > 10:
  352. career_path = career_path[-10:]
  353. return career_path
  354. except Exception as e:
  355. logging.error(f"更新职业轨迹失败: {str(e)}", exc_info=True)
  356. # 出错时返回原有的职业轨迹
  357. return existing_card.career_path if existing_card.career_path else []
  358. def create_main_card_with_duplicates(extracted_data, minio_path, suspected_duplicates, reason):
  359. """
  360. 创建主名片记录并标记疑似重复记录
  361. Args:
  362. extracted_data (dict): 提取的名片信息
  363. minio_path (str): MinIO中的图片路径
  364. suspected_duplicates (list): 疑似重复的名片记录列表
  365. reason (str): 重复原因描述
  366. Returns:
  367. BusinessCard: 创建的主名片记录
  368. """
  369. try:
  370. # 标准化手机号码
  371. mobile = normalize_mobile_numbers(extracted_data.get('mobile', ''))
  372. # 构建职业轨迹
  373. career_path = []
  374. if extracted_data.get('hotel_zh') or extracted_data.get('title_zh'):
  375. career_entry = {
  376. 'hotel_zh': extracted_data.get('hotel_zh', ''),
  377. 'hotel_en': extracted_data.get('hotel_en', ''),
  378. 'title_zh': extracted_data.get('title_zh', ''),
  379. 'title_en': extracted_data.get('title_en', ''),
  380. 'start_date': datetime.now().strftime('%Y-%m-%d'),
  381. 'image_path': minio_path
  382. }
  383. career_path.append(career_entry)
  384. # 创建新的主名片记录
  385. main_card = BusinessCard(
  386. name_zh=extracted_data.get('name_zh', ''),
  387. name_en=extracted_data.get('name_en', ''),
  388. title_zh=extracted_data.get('title_zh', ''),
  389. title_en=extracted_data.get('title_en', ''),
  390. mobile=mobile,
  391. phone=extracted_data.get('phone', ''),
  392. email=extracted_data.get('email', ''),
  393. hotel_zh=extracted_data.get('hotel_zh', ''),
  394. hotel_en=extracted_data.get('hotel_en', ''),
  395. address_zh=extracted_data.get('address_zh', ''),
  396. address_en=extracted_data.get('address_en', ''),
  397. postal_code_zh=extracted_data.get('postal_code_zh', ''),
  398. postal_code_en=extracted_data.get('postal_code_en', ''),
  399. brand_zh=extracted_data.get('brand_zh', ''),
  400. brand_en=extracted_data.get('brand_en', ''),
  401. affiliation_zh=extracted_data.get('affiliation_zh', ''),
  402. affiliation_en=extracted_data.get('affiliation_en', ''),
  403. brand_group=extracted_data.get('brand_group', ''),
  404. image_path=minio_path,
  405. career_path=career_path,
  406. origin_source={'source': 'manual_upload', 'timestamp': datetime.now().isoformat()},
  407. created_at=datetime.now(),
  408. updated_by='system',
  409. status='active'
  410. )
  411. # 保存主记录到数据库
  412. db.session.add(main_card)
  413. db.session.flush() # 获取主记录的ID
  414. # 创建重复记录标记
  415. suspected_duplicates_data = []
  416. for duplicate_card in suspected_duplicates:
  417. suspected_duplicates_data.append({
  418. 'id': duplicate_card.id,
  419. 'name_zh': duplicate_card.name_zh,
  420. 'mobile': duplicate_card.mobile,
  421. 'hotel_zh': duplicate_card.hotel_zh,
  422. 'title_zh': duplicate_card.title_zh
  423. })
  424. duplicate_record = DuplicateBusinessCard(
  425. main_card_id=main_card.id,
  426. suspected_duplicates=suspected_duplicates_data,
  427. duplicate_reason=reason,
  428. processing_status='pending',
  429. created_at=datetime.now()
  430. )
  431. # 保存重复记录标记
  432. db.session.add(duplicate_record)
  433. db.session.commit()
  434. logging.info(f"成功创建主名片记录 ID: {main_card.id},并标记 {len(suspected_duplicates)} 个疑似重复记录")
  435. return main_card
  436. except Exception as e:
  437. db.session.rollback()
  438. error_msg = f"创建主名片记录失败: {str(e)}"
  439. logging.error(error_msg, exc_info=True)
  440. raise Exception(error_msg)
  441. def get_minio_client():
  442. """获取MinIO客户端连接"""
  443. try:
  444. # 使用全局配置变量
  445. global minio_url, minio_access_key, minio_secret_key, minio_bucket, use_ssl
  446. logging.info(f"尝试连接MinIO服务器: {minio_url}")
  447. minio_client = boto3.client(
  448. 's3',
  449. endpoint_url=minio_url,
  450. aws_access_key_id=minio_access_key,
  451. aws_secret_access_key=minio_secret_key,
  452. config=Config(
  453. signature_version='s3v4',
  454. retries={'max_attempts': 3, 'mode': 'standard'},
  455. connect_timeout=10,
  456. read_timeout=30
  457. )
  458. )
  459. # 确保存储桶存在
  460. buckets = minio_client.list_buckets()
  461. bucket_names = [bucket['Name'] for bucket in buckets.get('Buckets', [])]
  462. logging.info(f"成功连接到MinIO服务器,现有存储桶: {bucket_names}")
  463. if minio_bucket not in bucket_names:
  464. logging.info(f"创建存储桶: {minio_bucket}")
  465. minio_client.create_bucket(Bucket=minio_bucket)
  466. return minio_client
  467. except Exception as e:
  468. logging.error(f"MinIO连接错误: {str(e)}")
  469. return None
  470. def get_business_cards():
  471. """
  472. 获取所有名片记录
  473. Returns:
  474. dict: 包含名片记录列表的字典
  475. """
  476. try:
  477. # 查询所有名片记录,按创建时间倒序排列
  478. cards = BusinessCard.query.filter_by(status='active').order_by(BusinessCard.created_at.desc()).all()
  479. # 转换为字典格式
  480. cards_data = [card.to_dict() for card in cards]
  481. return {
  482. 'code': 200,
  483. 'success': True,
  484. 'message': '获取名片列表成功',
  485. 'data': cards_data,
  486. 'count': len(cards_data)
  487. }
  488. except Exception as e:
  489. error_msg = f"获取名片列表失败: {str(e)}"
  490. logging.error(error_msg, exc_info=True)
  491. return {
  492. 'code': 500,
  493. 'success': False,
  494. 'message': error_msg,
  495. 'data': [],
  496. 'count': 0
  497. }
  498. def get_business_card(card_id):
  499. """
  500. 根据ID获取单个名片记录
  501. Args:
  502. card_id (int): 名片记录ID
  503. Returns:
  504. dict: 包含名片记录的字典
  505. """
  506. try:
  507. card = BusinessCard.query.get(card_id)
  508. if not card:
  509. return {
  510. 'code': 404,
  511. 'success': False,
  512. 'message': f'未找到ID为{card_id}的名片记录',
  513. 'data': None
  514. }
  515. return {
  516. 'code': 200,
  517. 'success': True,
  518. 'message': '获取名片记录成功',
  519. 'data': card.to_dict()
  520. }
  521. except Exception as e:
  522. error_msg = f"获取名片记录失败: {str(e)}"
  523. logging.error(error_msg, exc_info=True)
  524. return {
  525. 'code': 500,
  526. 'success': False,
  527. 'message': error_msg,
  528. 'data': None
  529. }
  530. def update_business_card(card_id, data):
  531. """
  532. 更新名片记录
  533. Args:
  534. card_id (int): 名片记录ID
  535. data (dict): 要更新的数据
  536. Returns:
  537. dict: 包含更新结果的字典
  538. """
  539. try:
  540. card = BusinessCard.query.get(card_id)
  541. if not card:
  542. return {
  543. 'code': 404,
  544. 'success': False,
  545. 'message': f'未找到ID为{card_id}的名片记录',
  546. 'data': None
  547. }
  548. # 更新字段
  549. updatable_fields = ['name_zh', 'name_en', 'title_zh', 'title_en', 'mobile', 'phone', 'email',
  550. 'hotel_zh', 'hotel_en', 'address_zh', 'address_en', 'postal_code_zh', 'postal_code_en',
  551. 'brand_zh', 'brand_en', 'affiliation_zh', 'affiliation_en', 'brand_group', 'talent_profile']
  552. for field in updatable_fields:
  553. if field in data and data[field] is not None:
  554. setattr(card, field, data[field])
  555. # 处理手机号标准化
  556. if 'mobile' in data:
  557. card.mobile = normalize_mobile_numbers(data['mobile'])
  558. card.updated_at = datetime.now()
  559. card.updated_by = data.get('updated_by', 'system')
  560. db.session.commit()
  561. return {
  562. 'code': 200,
  563. 'success': True,
  564. 'message': '名片记录更新成功',
  565. 'data': card.to_dict()
  566. }
  567. except Exception as e:
  568. db.session.rollback()
  569. error_msg = f"更新名片记录失败: {str(e)}"
  570. logging.error(error_msg, exc_info=True)
  571. return {
  572. 'code': 500,
  573. 'success': False,
  574. 'message': error_msg,
  575. 'data': None
  576. }
  577. def update_business_card_status(card_id, status):
  578. """
  579. 更新名片记录状态
  580. Args:
  581. card_id (int): 名片记录ID
  582. status (str): 新的状态
  583. Returns:
  584. dict: 包含更新结果的字典
  585. """
  586. try:
  587. card = BusinessCard.query.get(card_id)
  588. if not card:
  589. return {
  590. 'code': 404,
  591. 'success': False,
  592. 'message': f'未找到ID为{card_id}的名片记录',
  593. 'data': None
  594. }
  595. card.status = status
  596. card.updated_at = datetime.now()
  597. db.session.commit()
  598. return {
  599. 'code': 200,
  600. 'success': True,
  601. 'message': '名片状态更新成功',
  602. 'data': card.to_dict()
  603. }
  604. except Exception as e:
  605. db.session.rollback()
  606. error_msg = f"更新名片状态失败: {str(e)}"
  607. logging.error(error_msg, exc_info=True)
  608. return {
  609. 'code': 500,
  610. 'success': False,
  611. 'message': error_msg,
  612. 'data': None
  613. }
  614. def search_business_cards_by_mobile(mobile_number):
  615. """
  616. 根据手机号搜索名片记录
  617. Args:
  618. mobile_number (str): 手机号码
  619. Returns:
  620. dict: 包含搜索结果的字典
  621. """
  622. try:
  623. if not mobile_number or not mobile_number.strip():
  624. return {
  625. 'code': 400,
  626. 'success': False,
  627. 'message': '手机号码不能为空',
  628. 'data': [],
  629. 'count': 0
  630. }
  631. # 查询包含该手机号的记录
  632. cards = BusinessCard.query.filter(
  633. BusinessCard.mobile.contains(mobile_number.strip())
  634. ).all()
  635. # 转换为字典格式
  636. cards_data = [card.to_dict() for card in cards]
  637. return {
  638. 'code': 200,
  639. 'success': True,
  640. 'message': f'找到 {len(cards_data)} 条匹配记录',
  641. 'data': cards_data,
  642. 'count': len(cards_data)
  643. }
  644. except Exception as e:
  645. error_msg = f"搜索名片记录失败: {str(e)}"
  646. logging.error(error_msg, exc_info=True)
  647. return {
  648. 'code': 500,
  649. 'success': False,
  650. 'message': error_msg,
  651. 'data': [],
  652. 'count': 0
  653. }
  654. # 重复记录管理函数
  655. def get_duplicate_records(status=None):
  656. """
  657. 获取重复记录列表
  658. Args:
  659. status (str, optional): 筛选特定状态的记录
  660. Returns:
  661. dict: 包含操作结果和重复记录列表
  662. """
  663. try:
  664. query = DuplicateBusinessCard.query
  665. if status:
  666. query = query.filter_by(processing_status=status)
  667. duplicate_records = query.order_by(DuplicateBusinessCard.created_at.desc()).all()
  668. records_data = []
  669. for record in duplicate_records:
  670. record_dict = record.to_dict()
  671. if record.main_card:
  672. record_dict['main_card'] = record.main_card.to_dict()
  673. records_data.append(record_dict)
  674. return {
  675. 'code': 200,
  676. 'success': True,
  677. 'message': '获取重复记录列表成功',
  678. 'data': records_data,
  679. 'count': len(records_data)
  680. }
  681. except Exception as e:
  682. error_msg = f"获取重复记录列表失败: {str(e)}"
  683. logging.error(error_msg, exc_info=True)
  684. return {
  685. 'code': 500,
  686. 'success': False,
  687. 'message': error_msg,
  688. 'data': [],
  689. 'count': 0
  690. }
  691. def process_duplicate_record(duplicate_id, action, selected_duplicate_id=None, processed_by=None, notes=None):
  692. """
  693. 处理重复记录
  694. Args:
  695. duplicate_id (int): 重复记录ID
  696. action (str): 处理动作
  697. selected_duplicate_id (int, optional): 选择的重复记录ID
  698. processed_by (str, optional): 处理人
  699. notes (str, optional): 处理备注
  700. Returns:
  701. dict: 包含操作结果
  702. """
  703. try:
  704. duplicate_record = DuplicateBusinessCard.query.filter_by(main_card_id=duplicate_id).first()
  705. if not duplicate_record:
  706. return {
  707. 'code': 404,
  708. 'success': False,
  709. 'message': f'未找到main_card_id为{duplicate_id}的重复记录',
  710. 'data': None
  711. }
  712. if duplicate_record.processing_status != 'pending':
  713. return {
  714. 'code': 400,
  715. 'success': False,
  716. 'message': f'重复记录状态为{duplicate_record.processing_status},无法处理',
  717. 'data': None
  718. }
  719. main_card = duplicate_record.main_card
  720. if not main_card:
  721. return {
  722. 'code': 404,
  723. 'success': False,
  724. 'message': '未找到对应的主记录',
  725. 'data': None
  726. }
  727. result_data = None
  728. if action == 'merge_to_suspected':
  729. if not selected_duplicate_id:
  730. return {
  731. 'code': 400,
  732. 'success': False,
  733. 'message': '执行合并操作时必须提供selected_duplicate_id',
  734. 'data': None
  735. }
  736. target_card = BusinessCard.query.get(selected_duplicate_id)
  737. if not target_card:
  738. return {
  739. 'code': 404,
  740. 'success': False,
  741. 'message': f'未找到ID为{selected_duplicate_id}的目标记录',
  742. 'data': None
  743. }
  744. # 合并信息到目标记录
  745. target_card.name_en = main_card.name_en or target_card.name_en
  746. target_card.title_zh = main_card.title_zh or target_card.title_zh
  747. target_card.title_en = main_card.title_en or target_card.title_en
  748. if main_card.mobile:
  749. target_card.mobile = merge_mobile_numbers(target_card.mobile, main_card.mobile)
  750. target_card.phone = main_card.phone or target_card.phone
  751. target_card.email = main_card.email or target_card.email
  752. target_card.hotel_zh = main_card.hotel_zh or target_card.hotel_zh
  753. target_card.hotel_en = main_card.hotel_en or target_card.hotel_en
  754. target_card.address_zh = main_card.address_zh or target_card.address_zh
  755. target_card.address_en = main_card.address_en or target_card.address_en
  756. target_card.brand_group = main_card.brand_group or target_card.brand_group
  757. target_card.image_path = main_card.image_path
  758. target_card.updated_by = processed_by or 'system'
  759. new_data = {
  760. 'hotel_zh': main_card.hotel_zh,
  761. 'hotel_en': main_card.hotel_en,
  762. 'title_zh': main_card.title_zh,
  763. 'title_en': main_card.title_en
  764. }
  765. target_card.career_path = update_career_path(target_card, new_data, main_card.image_path)
  766. db.session.delete(duplicate_record)
  767. db.session.delete(main_card)
  768. result_data = target_card.to_dict()
  769. elif action == 'keep_main':
  770. result_data = main_card.to_dict()
  771. elif action == 'ignore':
  772. result_data = main_card.to_dict()
  773. if action != 'merge_to_suspected':
  774. duplicate_record.processing_status = 'processed'
  775. duplicate_record.processed_at = datetime.now()
  776. duplicate_record.processed_by = processed_by or 'system'
  777. duplicate_record.processing_notes = notes or f'执行操作: {action}'
  778. db.session.commit()
  779. return {
  780. 'code': 200,
  781. 'success': True,
  782. 'message': f'重复记录处理成功,操作: {action}',
  783. 'data': {
  784. 'duplicate_record': duplicate_record.to_dict() if action != 'merge_to_suspected' else None,
  785. 'result': result_data
  786. }
  787. }
  788. except Exception as e:
  789. db.session.rollback()
  790. error_msg = f"处理重复记录失败: {str(e)}"
  791. logging.error(error_msg, exc_info=True)
  792. return {
  793. 'code': 500,
  794. 'success': False,
  795. 'message': error_msg,
  796. 'data': None
  797. }
  798. def get_duplicate_record_detail(duplicate_id):
  799. """
  800. 获取重复记录详情
  801. Args:
  802. duplicate_id (int): 重复记录ID
  803. Returns:
  804. dict: 包含重复记录详细信息
  805. """
  806. try:
  807. duplicate_record = DuplicateBusinessCard.query.filter_by(main_card_id=duplicate_id).first()
  808. if not duplicate_record:
  809. return {
  810. 'code': 404,
  811. 'success': False,
  812. 'message': f'未找到main_card_id为{duplicate_id}的重复记录',
  813. 'data': None
  814. }
  815. record_dict = duplicate_record.to_dict()
  816. if duplicate_record.main_card:
  817. record_dict['main_card'] = duplicate_record.main_card.to_dict()
  818. else:
  819. record_dict['main_card'] = None
  820. suspected_duplicates_details = []
  821. if duplicate_record.suspected_duplicates:
  822. for suspected_item in duplicate_record.suspected_duplicates:
  823. try:
  824. if isinstance(suspected_item, dict):
  825. card_id = suspected_item.get('id')
  826. else:
  827. card_id = suspected_item
  828. if card_id:
  829. card_result = get_business_card(card_id)
  830. if card_result['success']:
  831. suspected_duplicates_details.append(card_result['data'])
  832. except Exception as e:
  833. logging.warning(f"获取疑似重复记录详情失败: {str(e)}")
  834. continue
  835. record_dict['suspected_duplicates_details'] = suspected_duplicates_details
  836. return {
  837. 'code': 200,
  838. 'success': True,
  839. 'message': '获取重复记录详情成功',
  840. 'data': record_dict
  841. }
  842. except Exception as e:
  843. error_msg = f"获取重复记录详情失败: {str(e)}"
  844. logging.error(error_msg, exc_info=True)
  845. return {
  846. 'code': 500,
  847. 'success': False,
  848. 'message': error_msg,
  849. 'data': None
  850. }
  851. def fix_broken_duplicate_records():
  852. """
  853. 修复损坏的重复记录
  854. Returns:
  855. dict: 包含修复结果
  856. """
  857. try:
  858. broken_records = DuplicateBusinessCard.query.filter_by(main_card_id=None).all()
  859. fixed_count = 0
  860. for record in broken_records:
  861. db.session.delete(record)
  862. fixed_count += 1
  863. db.session.commit()
  864. return {
  865. 'code': 200,
  866. 'success': True,
  867. 'message': f'成功修复 {fixed_count} 条损坏的重复记录',
  868. 'data': {'fixed_count': fixed_count}
  869. }
  870. except Exception as e:
  871. db.session.rollback()
  872. error_msg = f"修复重复记录失败: {str(e)}"
  873. logging.error(error_msg, exc_info=True)
  874. return {
  875. 'code': 500,
  876. 'success': False,
  877. 'message': error_msg,
  878. 'data': None
  879. }
  880. def get_parse_tasks(page=1, per_page=10, task_type=None, task_status=None):
  881. """
  882. 获取解析任务列表
  883. Args:
  884. page (int): 页码
  885. per_page (int): 每页记录数
  886. task_type (str): 任务类型过滤
  887. task_status (str): 任务状态过滤
  888. Returns:
  889. dict: 包含查询结果和分页信息
  890. """
  891. try:
  892. if page < 1 or per_page < 1 or per_page > 100:
  893. return {
  894. 'code': 400,
  895. 'success': False,
  896. 'message': '分页参数错误',
  897. 'data': None
  898. }
  899. query = ParseTaskRepository.query
  900. if task_type:
  901. query = query.filter_by(task_type=task_type)
  902. if task_status:
  903. query = query.filter_by(task_status=task_status)
  904. query = query.order_by(ParseTaskRepository.created_at.desc())
  905. pagination = query.paginate(page=page, per_page=per_page, error_out=False)
  906. tasks = [task.to_dict() for task in pagination.items]
  907. return {
  908. 'code': 200,
  909. 'success': True,
  910. 'message': '获取解析任务列表成功',
  911. 'data': {
  912. 'tasks': tasks,
  913. 'pagination': {
  914. 'page': page,
  915. 'per_page': per_page,
  916. 'total': pagination.total,
  917. 'pages': pagination.pages,
  918. 'has_next': pagination.has_next,
  919. 'has_prev': pagination.has_prev
  920. }
  921. }
  922. }
  923. except Exception as e:
  924. error_msg = f"获取解析任务列表失败: {str(e)}"
  925. logging.error(error_msg, exc_info=True)
  926. return {
  927. 'code': 500,
  928. 'success': False,
  929. 'message': error_msg,
  930. 'data': None
  931. }
  932. def get_parse_task_detail(task_name):
  933. """
  934. 获取解析任务详情
  935. Args:
  936. task_name (str): 任务名称
  937. Returns:
  938. dict: 包含查询结果
  939. """
  940. try:
  941. if not task_name:
  942. return {
  943. 'code': 400,
  944. 'success': False,
  945. 'message': '任务名称不能为空',
  946. 'data': None
  947. }
  948. task = ParseTaskRepository.query.filter_by(task_name=task_name).first()
  949. if not task:
  950. return {
  951. 'code': 404,
  952. 'success': False,
  953. 'message': f'未找到任务名称为 {task_name} 的记录',
  954. 'data': None
  955. }
  956. return {
  957. 'code': 200,
  958. 'success': True,
  959. 'message': f'成功获取任务 {task_name} 的详细信息',
  960. 'data': task.to_dict()
  961. }
  962. except Exception as e:
  963. error_msg = f"获取解析任务详情失败: {str(e)}"
  964. logging.error(error_msg, exc_info=True)
  965. return {
  966. 'code': 500,
  967. 'success': False,
  968. 'message': error_msg,
  969. 'data': None
  970. }
  971. def create_talent_tag(tag_data):
  972. """
  973. 创建人才标签节点
  974. Args:
  975. tag_data: 包含标签信息的字典,包括:
  976. - name: 标签名称
  977. - category: 标签分类
  978. - description: 标签描述
  979. - status: 启用状态
  980. Returns:
  981. dict: 操作结果字典
  982. """
  983. try:
  984. from app.services.neo4j_driver import neo4j_driver
  985. # 验证必要参数存在
  986. if not tag_data or 'name' not in tag_data or not tag_data['name']:
  987. return {
  988. 'code': 400,
  989. 'success': False,
  990. 'message': '标签名称为必填项',
  991. 'data': None
  992. }
  993. # 准备节点属性
  994. tag_properties = {
  995. 'name': tag_data.get('name'),
  996. 'category': tag_data.get('category', '未分类'),
  997. 'describe': tag_data.get('description', ''), # 使用describe与现有系统保持一致
  998. 'status': tag_data.get('status', 'active'),
  999. 'time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  1000. }
  1001. # 生成标签的英文名(可选)
  1002. from app.core.graph.graph_operations import create_or_get_node
  1003. # 如果提供了名称,尝试获取英文翻译
  1004. if 'name' in tag_data and tag_data['name']:
  1005. try:
  1006. from app.api.data_interface.routes import translate_and_parse
  1007. en_name = translate_and_parse(tag_data['name'])
  1008. tag_properties['en_name'] = en_name[0] if en_name and isinstance(en_name, list) else ''
  1009. except Exception as e:
  1010. logging.warning(f"获取标签英文名失败: {str(e)}")
  1011. tag_properties['en_name'] = ''
  1012. # 创建节点
  1013. node_id = create_or_get_node('DataLabel', **tag_properties)
  1014. if node_id:
  1015. return {
  1016. 'code': 200,
  1017. 'success': True,
  1018. 'message': '人才标签创建成功',
  1019. 'data': {
  1020. 'id': node_id,
  1021. **tag_properties
  1022. }
  1023. }
  1024. else:
  1025. return {
  1026. 'code': 500,
  1027. 'success': False,
  1028. 'message': '人才标签创建失败',
  1029. 'data': None
  1030. }
  1031. except Exception as e:
  1032. logging.error(f"创建人才标签失败: {str(e)}", exc_info=True)
  1033. return {
  1034. 'code': 500,
  1035. 'success': False,
  1036. 'message': f'创建人才标签失败: {str(e)}',
  1037. 'data': None
  1038. }
  1039. def get_talent_tag_list():
  1040. """
  1041. 从Neo4j图数据库获取人才标签列表
  1042. Returns:
  1043. dict: 包含操作结果和标签列表的字典
  1044. """
  1045. try:
  1046. from app.services.neo4j_driver import neo4j_driver
  1047. # 构建Cypher查询语句,获取分类为talent的标签
  1048. query = """
  1049. MATCH (n:DataLabel)
  1050. WHERE n.category CONTAINS 'talent' OR n.category CONTAINS '人才'
  1051. RETURN id(n) as id, n.name as name, n.en_name as en_name,
  1052. n.category as category, n.describe as description,
  1053. n.status as status, n.time as time
  1054. ORDER BY n.time DESC
  1055. """
  1056. # 执行查询
  1057. tags = []
  1058. with neo4j_driver.get_session() as session:
  1059. result = session.run(query)
  1060. # 处理查询结果
  1061. for record in result:
  1062. tag = {
  1063. 'id': record['id'],
  1064. 'name': record['name'],
  1065. 'en_name': record['en_name'],
  1066. 'category': record['category'],
  1067. 'description': record['description'],
  1068. 'status': record['status'],
  1069. 'time': record['time']
  1070. }
  1071. tags.append(tag)
  1072. return {
  1073. 'code': 200,
  1074. 'success': True,
  1075. 'message': '获取人才标签列表成功',
  1076. 'data': tags
  1077. }
  1078. except Exception as e:
  1079. error_msg = f"获取人才标签列表失败: {str(e)}"
  1080. logging.error(error_msg, exc_info=True)
  1081. return {
  1082. 'code': 500,
  1083. 'success': False,
  1084. 'message': error_msg,
  1085. 'data': []
  1086. }
  1087. def update_talent_tag(tag_id, tag_data):
  1088. """
  1089. 更新人才标签节点属性
  1090. Args:
  1091. tag_id: 标签节点ID
  1092. tag_data: 包含更新信息的字典,可能包括:
  1093. - name: 标签名称
  1094. - category: 标签分类
  1095. - description: 标签描述
  1096. - status: 启用状态
  1097. Returns:
  1098. dict: 操作结果字典
  1099. """
  1100. try:
  1101. from app.services.neo4j_driver import neo4j_driver
  1102. # 准备要更新的属性
  1103. update_properties = {}
  1104. # 检查并添加需要更新的属性
  1105. if 'name' in tag_data and tag_data['name']:
  1106. update_properties['name'] = tag_data['name']
  1107. # 如果名称更新了,尝试更新英文名称
  1108. try:
  1109. from app.api.data_interface.routes import translate_and_parse
  1110. en_name = translate_and_parse(tag_data['name'])
  1111. update_properties['en_name'] = en_name[0] if en_name and isinstance(en_name, list) else ''
  1112. except Exception as e:
  1113. logging.warning(f"更新标签英文名失败: {str(e)}")
  1114. if 'category' in tag_data and tag_data['category']:
  1115. update_properties['category'] = tag_data['category']
  1116. if 'description' in tag_data:
  1117. update_properties['describe'] = tag_data['description']
  1118. if 'status' in tag_data:
  1119. update_properties['status'] = tag_data['status']
  1120. # 添加更新时间
  1121. update_properties['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  1122. # 如果没有可更新的属性,返回错误
  1123. if not update_properties:
  1124. return {
  1125. 'code': 400,
  1126. 'success': False,
  1127. 'message': '未提供任何可更新的属性',
  1128. 'data': None
  1129. }
  1130. # 构建更新的Cypher查询
  1131. set_clauses = []
  1132. params = {'nodeId': tag_id}
  1133. for key, value in update_properties.items():
  1134. param_name = f"param_{key}"
  1135. set_clauses.append(f"n.{key} = ${param_name}")
  1136. params[param_name] = value
  1137. set_clause = ", ".join(set_clauses)
  1138. query = f"""
  1139. MATCH (n:DataLabel)
  1140. WHERE id(n) = $nodeId
  1141. SET {set_clause}
  1142. RETURN id(n) as id, n.name as name, n.en_name as en_name,
  1143. n.category as category, n.describe as description,
  1144. n.status as status, n.time as time
  1145. """
  1146. # 执行更新查询
  1147. with neo4j_driver.get_session() as session:
  1148. result = session.run(query, **params)
  1149. record = result.single()
  1150. if not record:
  1151. return {
  1152. 'code': 404,
  1153. 'success': False,
  1154. 'message': f'未找到ID为{tag_id}的标签',
  1155. 'data': None
  1156. }
  1157. # 提取更新后的标签信息
  1158. updated_tag = {
  1159. 'id': record['id'],
  1160. 'name': record['name'],
  1161. 'en_name': record['en_name'],
  1162. 'category': record['category'],
  1163. 'description': record['description'],
  1164. 'status': record['status'],
  1165. 'time': record['time']
  1166. }
  1167. return {
  1168. 'code': 200,
  1169. 'success': True,
  1170. 'message': '人才标签更新成功',
  1171. 'data': updated_tag
  1172. }
  1173. except Exception as e:
  1174. error_msg = f"更新人才标签失败: {str(e)}"
  1175. logging.error(error_msg, exc_info=True)
  1176. return {
  1177. 'code': 500,
  1178. 'success': False,
  1179. 'message': error_msg,
  1180. 'data': None
  1181. }
  1182. def delete_talent_tag(tag_id):
  1183. """
  1184. 删除人才标签节点及其相关关系
  1185. Args:
  1186. tag_id: 标签节点ID
  1187. Returns:
  1188. dict: 操作结果字典
  1189. """
  1190. try:
  1191. from app.services.neo4j_driver import neo4j_driver
  1192. # 首先获取要删除的标签信息,以便在成功后返回
  1193. get_query = """
  1194. MATCH (n:DataLabel)
  1195. WHERE id(n) = $nodeId
  1196. RETURN id(n) as id, n.name as name, n.en_name as en_name,
  1197. n.category as category, n.describe as description,
  1198. n.status as status, n.time as time
  1199. """
  1200. # 构建删除节点和关系的Cypher查询
  1201. delete_query = """
  1202. MATCH (n:DataLabel)
  1203. WHERE id(n) = $nodeId
  1204. OPTIONAL MATCH (n)-[r]-()
  1205. DELETE r, n
  1206. RETURN count(n) AS deleted
  1207. """
  1208. # 执行查询
  1209. tag_info = None
  1210. with neo4j_driver.get_session() as session:
  1211. # 先获取标签信息
  1212. result = session.run(get_query, nodeId=tag_id)
  1213. record = result.single()
  1214. if not record:
  1215. return {
  1216. 'code': 404,
  1217. 'success': False,
  1218. 'message': f'未找到ID为{tag_id}的标签',
  1219. 'data': None
  1220. }
  1221. # 保存标签信息用于返回
  1222. tag_info = {
  1223. 'id': record['id'],
  1224. 'name': record['name'],
  1225. 'en_name': record['en_name'],
  1226. 'category': record['category'],
  1227. 'description': record['description'],
  1228. 'status': record['status'],
  1229. 'time': record['time']
  1230. }
  1231. # 执行删除操作
  1232. delete_result = session.run(delete_query, nodeId=tag_id)
  1233. deleted = delete_result.single()['deleted']
  1234. if deleted > 0:
  1235. return {
  1236. 'code': 200,
  1237. 'success': True,
  1238. 'message': '人才标签删除成功',
  1239. 'data': tag_info
  1240. }
  1241. else:
  1242. return {
  1243. 'code': 404,
  1244. 'success': False,
  1245. 'message': f'未能删除ID为{tag_id}的标签',
  1246. 'data': None
  1247. }
  1248. except Exception as e:
  1249. error_msg = f"删除人才标签失败: {str(e)}"
  1250. logging.error(error_msg, exc_info=True)
  1251. return {
  1252. 'code': 500,
  1253. 'success': False,
  1254. 'message': error_msg,
  1255. 'data': None
  1256. }
  1257. def query_neo4j_graph(query_requirement):
  1258. """
  1259. 查询Neo4j图数据库,通过Deepseek API生成Cypher脚本
  1260. Args:
  1261. query_requirement (str): 查询需求描述
  1262. Returns:
  1263. dict: 包含查询结果的字典,JSON格式
  1264. """
  1265. try:
  1266. # 导入必要的模块
  1267. from app.services.neo4j_driver import neo4j_driver
  1268. import requests
  1269. import json
  1270. # Deepseek API配置
  1271. api_key = DEEPSEEK_API_KEY
  1272. api_url = DEEPSEEK_API_URL
  1273. # 步骤1: 从Neo4j获取所有标签列表
  1274. logging.info("第一步:从Neo4j获取人才类别的标签列表")
  1275. all_labels_query = """
  1276. MATCH (dl:DataLabel)
  1277. WHERE dl.category CONTAINS '人才' OR dl.category CONTAINS 'talent'
  1278. RETURN dl.name as name
  1279. """
  1280. all_labels = []
  1281. with neo4j_driver.get_session() as session:
  1282. result = session.run(all_labels_query)
  1283. for record in result:
  1284. all_labels.append(record['name'])
  1285. logging.info(f"获取到{len(all_labels)}个人才标签: {all_labels}")
  1286. # 步骤2: 使用Deepseek判断查询需求中的关键信息与标签的对应关系
  1287. logging.info("第二步:调用Deepseek API匹配查询需求与标签")
  1288. # 构建所有标签的JSON字符串
  1289. labels_json = json.dumps(all_labels, ensure_ascii=False)
  1290. # 构建匹配标签的提示语
  1291. matching_prompt = f"""
  1292. 请分析以下查询需求,并从标签列表中找出与查询需求相关的标签。
  1293. ## 查询需求
  1294. {query_requirement}
  1295. ## 可用标签列表
  1296. {labels_json}
  1297. ## 输出要求
  1298. 1. 请以JSON数组格式返回匹配的标签名称列表,格式如: ["标签1", "标签2", "标签3"]
  1299. 2. 只返回标签名称数组,不要包含任何解释或其他文本
  1300. 3. 如果没有找到匹配的标签,请返回空数组 []
  1301. """
  1302. # 调用Deepseek API匹配标签
  1303. headers = {
  1304. "Authorization": f"Bearer {api_key}",
  1305. "Content-Type": "application/json"
  1306. }
  1307. payload = {
  1308. "model": "deepseek-chat",
  1309. "messages": [
  1310. {"role": "system", "content": "你是一个专业的文本分析和匹配专家。"},
  1311. {"role": "user", "content": matching_prompt}
  1312. ],
  1313. "temperature": 0.1,
  1314. "response_format": {"type": "json_object"}
  1315. }
  1316. logging.info("发送请求到Deepseek API匹配标签:"+matching_prompt)
  1317. response = requests.post(api_url, headers=headers, json=payload, timeout=30)
  1318. response.raise_for_status()
  1319. # 解析API响应
  1320. result = response.json()
  1321. matching_content = result.get("choices", [{}])[0].get("message", {}).get("content", "[]")
  1322. # 提取JSON数组
  1323. try:
  1324. # 尝试直接解析返回结果,预期格式为 ["新开酒店经验", "五星级酒店", "总经理"]
  1325. logging.info(f"Deepseek返回的匹配内容: {matching_content}")
  1326. # 如果返回的是JSON字符串,先去除可能的前后缀文本
  1327. if isinstance(matching_content, str):
  1328. # 查找JSON数组的开始和结束位置
  1329. start_idx = matching_content.find('[')
  1330. end_idx = matching_content.rfind(']') + 1
  1331. if start_idx >= 0 and end_idx > start_idx:
  1332. json_str = matching_content[start_idx:end_idx]
  1333. matched_labels = json.loads(json_str)
  1334. else:
  1335. matched_labels = []
  1336. else:
  1337. matched_labels = []
  1338. # 确保结果是字符串列表
  1339. if matched_labels and all(isinstance(item, str) for item in matched_labels):
  1340. logging.info(f"成功解析到标签列表: {matched_labels}")
  1341. else:
  1342. logging.warning("解析结果不是预期的字符串列表格式,将使用空列表")
  1343. matched_labels = []
  1344. except json.JSONDecodeError as e:
  1345. logging.error(f"JSON解析错误: {str(e)}")
  1346. matched_labels = []
  1347. except Exception as e:
  1348. logging.error(f"解析匹配标签时出错: {str(e)}")
  1349. matched_labels = []
  1350. logging.info(f"匹配到的标签: {matched_labels}")
  1351. # 如果没有匹配到标签,返回空结果
  1352. if not matched_labels:
  1353. return {
  1354. 'code': 200,
  1355. 'success': True,
  1356. 'message': '未找到与查询需求匹配的标签',
  1357. 'query': '',
  1358. 'data': []
  1359. }
  1360. # 步骤3: 构建Cypher生成提示文本
  1361. logging.info("第三步:构建提示文本生成Cypher查询语句")
  1362. # 将匹配的标签转换为字符串
  1363. matched_labels_str = ", ".join([f"'{label}'" for label in matched_labels])
  1364. # 构建生成Cypher的提示语
  1365. cypher_prompt = f"""
  1366. 请根据以下Neo4j图数据库结构和已匹配的标签,生成一个Cypher查询脚本。
  1367. ## 图数据库结构
  1368. ### 节点
  1369. 1. talent - 人才节点
  1370. 属性: pg_id(PostgreSQL数据库ID), name_zh(中文姓名), name_en(英文姓名),
  1371. mobile(手机号码), email(电子邮箱), updated_at(更新时间)
  1372. 2. DataLabel - 人才标签节点
  1373. ### 关系
  1374. BELONGS_TO - 从属关系
  1375. (talent)-[BELONGS_TO]->(DataLabel) - 人才属于某标签
  1376. ## 匹配的标签列表
  1377. [{matched_labels_str}]
  1378. ## 查询需求
  1379. {query_requirement}
  1380. ## 输出要求
  1381. 1. 只输出有效的Cypher查询语句,不要包含任何解释或注释
  1382. 2. 确保return语句中包含talent节点属性
  1383. 3. 尽量利用图数据库的特性来优化查询效率
  1384. 4. 使用WITH子句和COLLECT函数收集标签,确保查询到同时拥有所有标签的人才
  1385. 注意:请直接返回Cypher查询语句,无需任何其他文本。
  1386. 以下是一个示例:
  1387. 假设匹配的标签是 ['五星级酒店', '新开酒店经验', '总经理']
  1388. 生成的Cypher查询语句应该是:
  1389. MATCH (t:talent)-[:BELONGS_TO]->(dl:DataLabel)
  1390. WHERE dl.name IN ['五星级酒店', '新开酒店经验', '总经理']
  1391. WITH t, COLLECT(DISTINCT dl.name) AS labels
  1392. WHERE size(labels) = 3
  1393. RETURN t.pg_id as pg_id, t.name_zh as name_zh, t.name_en as name_en, t.mobile as mobile, t.email as email, t.updated_at as updated_at
  1394. """
  1395. # 调用Deepseek API生成Cypher脚本
  1396. payload = {
  1397. "model": "deepseek-chat",
  1398. "messages": [
  1399. {"role": "system", "content": "你是一个专业的Neo4j Cypher查询专家。"},
  1400. {"role": "user", "content": cypher_prompt}
  1401. ],
  1402. "temperature": 0.1
  1403. }
  1404. logging.info("发送请求到Deepseek API生成Cypher脚本")
  1405. response = requests.post(api_url, headers=headers, json=payload, timeout=30)
  1406. response.raise_for_status()
  1407. # 解析API响应
  1408. result = response.json()
  1409. cypher_script = result.get("choices", [{}])[0].get("message", {}).get("content", "")
  1410. # 清理Cypher脚本,移除不必要的markdown格式或注释
  1411. cypher_script = cypher_script.strip()
  1412. if cypher_script.startswith("```cypher"):
  1413. cypher_script = cypher_script[9:]
  1414. elif cypher_script.startswith("```"):
  1415. cypher_script = cypher_script[3:]
  1416. if cypher_script.endswith("```"):
  1417. cypher_script = cypher_script[:-3]
  1418. cypher_script = cypher_script.strip()
  1419. logging.info(f"生成的Cypher脚本: {cypher_script}")
  1420. # 步骤4: 执行Cypher脚本
  1421. logging.info("第四步:执行Cypher脚本并返回结果")
  1422. with neo4j_driver.get_session() as session:
  1423. result = session.run(cypher_script)
  1424. records = [record.data() for record in result]
  1425. # 构建查询结果
  1426. response_data = {
  1427. 'code': 200,
  1428. 'success': True,
  1429. 'message': '查询成功执行',
  1430. 'query': cypher_script,
  1431. 'matched_labels': matched_labels,
  1432. 'data': records
  1433. }
  1434. return response_data
  1435. except requests.exceptions.HTTPError as e:
  1436. error_msg = f"调用Deepseek API失败: {str(e)}"
  1437. logging.error(error_msg)
  1438. if hasattr(e, 'response') and e.response:
  1439. logging.error(f"错误状态码: {e.response.status_code}")
  1440. logging.error(f"错误内容: {e.response.text}")
  1441. return {
  1442. 'code': 500,
  1443. 'success': False,
  1444. 'message': error_msg,
  1445. 'data': []
  1446. }
  1447. except Exception as e:
  1448. error_msg = f"查询Neo4j图数据库失败: {str(e)}"
  1449. logging.error(error_msg, exc_info=True)
  1450. return {
  1451. 'code': 500,
  1452. 'success': False,
  1453. 'message': error_msg,
  1454. 'data': []
  1455. }
  1456. def talent_get_tags(talent_id):
  1457. """
  1458. 根据talent ID获取人才节点关联的标签
  1459. Args:
  1460. talent_id (int): 人才节点pg_id
  1461. Returns:
  1462. dict: 包含人才ID和关联标签的字典,JSON格式
  1463. """
  1464. try:
  1465. # 导入必要的模块
  1466. from app.services.neo4j_driver import neo4j_driver
  1467. # 准备查询返回数据
  1468. response_data = {
  1469. 'code': 200,
  1470. 'success': True,
  1471. 'message': '获取人才标签成功',
  1472. 'data': []
  1473. }
  1474. # 构建Cypher查询语句,获取人才节点关联的标签
  1475. cypher_query = """
  1476. MATCH (t:talent)-[r:BELONGS_TO]->(tag:DataLabel)
  1477. WHERE t.pg_id = $talent_id
  1478. RETURN t.pg_id as talent_id, tag.name as tag_name
  1479. """
  1480. # 执行查询
  1481. with neo4j_driver.get_session() as session:
  1482. result = session.run(cypher_query, talent_id=int(talent_id))
  1483. records = list(result)
  1484. # 如果没有查询到标签,返回空数组
  1485. if not records:
  1486. response_data['message'] = f'人才pg_id {talent_id} 没有关联的标签'
  1487. return response_data
  1488. # 处理查询结果
  1489. for record in records:
  1490. talent_tag = {
  1491. 'talent': record['talent_id'],
  1492. 'tag': record['tag_name']
  1493. }
  1494. response_data['data'].append(talent_tag)
  1495. return response_data
  1496. except Exception as e:
  1497. error_msg = f"获取人才标签失败: {str(e)}"
  1498. logging.error(error_msg, exc_info=True)
  1499. return {
  1500. 'code': 500,
  1501. 'success': False,
  1502. 'message': error_msg,
  1503. 'data': []
  1504. }
  1505. def talent_update_tags(data):
  1506. """
  1507. 根据传入的JSON数据为人才节点创建与标签的BELONGS_TO关系
  1508. Args:
  1509. data (list): 包含talent和tag字段的对象列表
  1510. 例如: [
  1511. {"talent": 12345, "tag": "市场营销"},
  1512. {"talent": 12345, "tag": "酒店管理"}
  1513. ]
  1514. Returns:
  1515. dict: 操作结果和状态信息
  1516. """
  1517. try:
  1518. # 导入必要的模块
  1519. from app.services.neo4j_driver import neo4j_driver
  1520. # 验证输入参数
  1521. if not isinstance(data, list):
  1522. return {
  1523. 'code': 400,
  1524. 'success': False,
  1525. 'message': '参数格式错误,需要JSON数组',
  1526. 'data': None
  1527. }
  1528. if len(data) == 0:
  1529. return {
  1530. 'code': 400,
  1531. 'success': False,
  1532. 'message': '数据列表为空',
  1533. 'data': None
  1534. }
  1535. # 获取当前时间
  1536. current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  1537. # 成功和失败计数
  1538. success_count = 0
  1539. failed_items = []
  1540. # 按talent分组处理数据
  1541. talent_tags = {}
  1542. for item in data:
  1543. # 验证每个项目的格式
  1544. if not isinstance(item, dict) or 'talent' not in item or 'tag' not in item:
  1545. failed_items.append(item)
  1546. continue
  1547. talent_id = item.get('talent')
  1548. tag_name = item.get('tag')
  1549. # 验证talent_id和tag_name的值
  1550. if not talent_id or not tag_name or not isinstance(tag_name, str):
  1551. failed_items.append(item)
  1552. continue
  1553. # 按talent_id分组
  1554. if talent_id not in talent_tags:
  1555. talent_tags[talent_id] = []
  1556. talent_tags[talent_id].append(tag_name)
  1557. with neo4j_driver.get_session() as session:
  1558. # 处理每个talent及其标签
  1559. for talent_id, tags in talent_tags.items():
  1560. # 首先验证talent节点是否存在
  1561. check_talent_query = """
  1562. MATCH (t:talent)
  1563. WHERE t.pg_id = $talent_id
  1564. RETURN t
  1565. """
  1566. talent_result = session.run(check_talent_query, talent_id=int(talent_id))
  1567. if not talent_result.single():
  1568. # 该talent不存在,记录失败项并继续下一个talent
  1569. for tag in tags:
  1570. failed_items.append({'talent_pg_id': talent_id, 'tag': tag})
  1571. continue
  1572. # 首先清除所有现有的BELONGS_TO关系
  1573. clear_relations_query = """
  1574. MATCH (t:talent)-[r:BELONGS_TO]->(:DataLabel)
  1575. WHERE t.pg_id = $talent_id
  1576. DELETE r
  1577. RETURN count(r) as deleted_count
  1578. """
  1579. clear_result = session.run(clear_relations_query, talent_id=int(talent_id))
  1580. deleted_count = clear_result.single()['deleted_count']
  1581. logging.info(f"已删除talent_id={talent_id}的{deleted_count}个已有标签关系")
  1582. # 处理每个标签
  1583. for tag_name in tags:
  1584. try:
  1585. # 1. 查找或创建标签节点
  1586. # 先查找是否存在该标签
  1587. find_tag_query = """
  1588. MATCH (tag:DataLabel)
  1589. WHERE tag.name = $tag_name
  1590. RETURN id(tag) as tag_id
  1591. """
  1592. tag_result = session.run(find_tag_query, tag_name=tag_name)
  1593. tag_record = tag_result.single()
  1594. if tag_record:
  1595. tag_id = tag_record['tag_id']
  1596. else:
  1597. # 创建新标签
  1598. create_tag_query = """
  1599. CREATE (tag:DataLabel {name: $name, category: $category, updated_at: $updated_at})
  1600. RETURN id(tag) as tag_id
  1601. """
  1602. tag_result = session.run(
  1603. create_tag_query,
  1604. name=tag_name,
  1605. category='talent',
  1606. updated_at=current_time
  1607. )
  1608. tag_record = tag_result.single()
  1609. tag_id = tag_record['tag_id']
  1610. # 2. 创建人才与标签的BELONGS_TO关系
  1611. create_relation_query = """
  1612. MATCH (t:talent), (tag:DataLabel)
  1613. WHERE t.pg_id = $talent_id AND tag.name = $tag_name
  1614. CREATE (t)-[r:BELONGS_TO]->(tag)
  1615. SET r.created_at = $current_time
  1616. RETURN r
  1617. """
  1618. relation_result = session.run(
  1619. create_relation_query,
  1620. talent_id=int(talent_id),
  1621. tag_name=tag_name,
  1622. current_time=current_time
  1623. )
  1624. if relation_result.single():
  1625. success_count += 1
  1626. else:
  1627. failed_items.append({'talent_pg_id': talent_id, 'tag': tag_name})
  1628. except Exception as tag_error:
  1629. logging.error(f"为标签 {tag_name} 创建关系时出错: {str(tag_error)}")
  1630. failed_items.append({'talent_pg_id': talent_id, 'tag': tag_name})
  1631. # 返回结果
  1632. total_items = len(data)
  1633. if success_count == total_items:
  1634. return {
  1635. 'code': 200,
  1636. 'success': True,
  1637. 'message': f'成功创建或更新了 {success_count} 个标签关系',
  1638. 'data': {
  1639. 'success_count': success_count,
  1640. 'total_count': total_items,
  1641. 'failed_items': []
  1642. }
  1643. }
  1644. elif success_count > 0:
  1645. return {
  1646. 'code': 206, # Partial Content
  1647. 'success': True,
  1648. 'message': f'部分成功: 创建或更新了 {success_count}/{total_items} 个标签关系',
  1649. 'data': {
  1650. 'success_count': success_count,
  1651. 'total_count': total_items,
  1652. 'failed_items': failed_items
  1653. }
  1654. }
  1655. else:
  1656. return {
  1657. 'code': 500,
  1658. 'success': False,
  1659. 'message': '无法创建任何标签关系',
  1660. 'data': {
  1661. 'success_count': 0,
  1662. 'total_count': total_items,
  1663. 'failed_items': failed_items
  1664. }
  1665. }
  1666. except Exception as e:
  1667. error_msg = f"更新人才标签关系失败: {str(e)}"
  1668. logging.error(error_msg, exc_info=True)
  1669. return {
  1670. 'code': 500,
  1671. 'success': False,
  1672. 'message': error_msg,
  1673. 'data': None
  1674. }
  1675. def parse_text_with_qwen25VLplus(image_data):
  1676. """
  1677. 使用阿里云的 Qwen VL Max 模型解析图像中的名片信息
  1678. Args:
  1679. image_data (bytes): 图像的二进制数据
  1680. Returns:
  1681. dict: 解析的名片信息
  1682. """
  1683. try:
  1684. # 将图片数据转为 base64 编码
  1685. base64_image = base64.b64encode(image_data).decode('utf-8')
  1686. # 初始化 OpenAI 客户端,配置为阿里云 API
  1687. client = OpenAI(
  1688. api_key=QWEN_API_KEY,
  1689. base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
  1690. )
  1691. # 构建优化后的提示语
  1692. prompt = """你是企业名片的信息提取专家。请仔细分析提供的图片,精确提取名片信息。
  1693. ## 提取要求
  1694. - 区分中英文内容,分别提取
  1695. - 保持提取信息的原始格式(如大小写、标点)
  1696. - 对于无法识别或名片中不存在的信息,返回空字符串
  1697. - 名片中没有的信息,请不要猜测
  1698. ## 需提取的字段
  1699. 1. 中文姓名 (name_zh)
  1700. 2. 英文姓名 (name_en)
  1701. 3. 中文职位/头衔 (title_zh)
  1702. 4. 英文职位/头衔 (title_en)
  1703. 5. 中文酒店/公司名称 (hotel_zh)
  1704. 6. 英文酒店/公司名称 (hotel_en)
  1705. 7. 手机号码 (mobile) - 如有多个手机号码,使用逗号分隔,最多提取3个
  1706. 8. 固定电话 (phone) - 如有多个,使用逗号分隔
  1707. 9. 电子邮箱 (email)
  1708. 10. 中文地址 (address_zh)
  1709. 11. 英文地址 (address_en)
  1710. 12. 中文邮政编码 (postal_code_zh)
  1711. 13. 英文邮政编码 (postal_code_en)
  1712. 14. 生日 (birthday) - 格式为YYYY-MM-DD,如1990-01-01
  1713. 15. 年龄 (age) - 数字格式,如30
  1714. 16. 籍贯 (native_place) - 出生地或户籍所在地信息
  1715. 17. 居住地 (residence) - 个人居住地址信息
  1716. 18. 品牌组合 (brand_group) - 如有多个品牌,使用逗号分隔
  1717. 19. 职业轨迹 (career_path) - 如能从名片中推断,以JSON数组格式返回,包含当前日期,公司名称和职位。自动生成当前日期。
  1718. 20. 隶属关系 (affiliation) - 如能从名片中推断,以JSON数组格式返回,包含公司名称和隶属集团名称
  1719. ## 输出格式
  1720. 请以严格的JSON格式返回结果,不要添加任何额外解释文字。JSON格式如下:
  1721. ```json
  1722. {
  1723. "name_zh": "",
  1724. "name_en": "",
  1725. "title_zh": "",
  1726. "title_en": "",
  1727. "hotel_zh": "",
  1728. "hotel_en": "",
  1729. "mobile": "",
  1730. "phone": "",
  1731. "email": "",
  1732. "address_zh": "",
  1733. "address_en": "",
  1734. "postal_code_zh": "",
  1735. "postal_code_en": "",
  1736. "birthday": "",
  1737. "age": 0,
  1738. "native_place": "",
  1739. "residence": "",
  1740. "brand_group": "",
  1741. "career_path": [],
  1742. "affiliation": []
  1743. }
  1744. ```"""
  1745. # 调用 Qwen VL Max API
  1746. logging.info("发送请求到 Qwen VL Max 模型")
  1747. completion = client.chat.completions.create(
  1748. # model="qwen-vl-plus",
  1749. model="qwen-vl-max-latest",
  1750. messages=[
  1751. {
  1752. "role": "user",
  1753. "content": [
  1754. {"type": "text", "text": prompt},
  1755. {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
  1756. ]
  1757. }
  1758. ],
  1759. temperature=0.1, # 降低温度增加精确性
  1760. response_format={"type": "json_object"} # 要求输出JSON格式
  1761. )
  1762. # 解析响应
  1763. response_content = completion.choices[0].message.content
  1764. logging.info(f"成功从 Qwen 模型获取响应: {response_content}")
  1765. # 尝试从响应中提取 JSON
  1766. try:
  1767. extracted_data = json.loads(response_content)
  1768. logging.info("成功解析 Qwen 响应中的 JSON")
  1769. except json.JSONDecodeError:
  1770. logging.warning("无法解析 JSON,尝试从文本中提取信息")
  1771. # 这里可以调用其他的解析函数,但为了简化,先返回错误
  1772. raise Exception("无法解析 Qwen 返回的 JSON 格式")
  1773. # 确保所有必要字段存在
  1774. required_fields = [
  1775. 'name_zh', 'name_en', 'title_zh', 'title_en',
  1776. 'hotel_zh', 'hotel_en', 'mobile', 'phone',
  1777. 'email', 'address_zh', 'address_en',
  1778. 'postal_code_zh', 'postal_code_en', 'birthday', 'age', 'native_place', 'residence',
  1779. 'brand_group', 'career_path'
  1780. ]
  1781. for field in required_fields:
  1782. if field not in extracted_data:
  1783. if field == 'career_path':
  1784. extracted_data[field] = []
  1785. elif field == 'age':
  1786. extracted_data[field] = 0
  1787. else:
  1788. extracted_data[field] = ""
  1789. # 为career_path增加一条记录
  1790. if extracted_data.get('hotel_zh') or extracted_data.get('hotel_en') or extracted_data.get('title_zh') or extracted_data.get('title_en'):
  1791. career_entry = {
  1792. 'date': datetime.now().strftime('%Y-%m-%d'),
  1793. 'hotel_en': extracted_data.get('hotel_en', ''),
  1794. 'hotel_zh': extracted_data.get('hotel_zh', ''),
  1795. 'image_path': '',
  1796. 'source': 'business_card_creation',
  1797. 'title_en': extracted_data.get('title_en', ''),
  1798. 'title_zh': extracted_data.get('title_zh', '')
  1799. }
  1800. # 直接清空原有的career_path内容,用career_entry写入
  1801. extracted_data['career_path'] = [career_entry]
  1802. logging.info(f"为解析结果设置了career_path记录: {career_entry}")
  1803. return extracted_data
  1804. except Exception as e:
  1805. error_msg = f"Qwen VL Max 模型解析失败: {str(e)}"
  1806. logging.error(error_msg, exc_info=True)
  1807. raise Exception(error_msg)