# Script: set up a Chroma collection and split the knowledge-base documents
# into overlapping chunks.

import chromadb
from embedding.embedding import get_embedding
from langchain.text_splitter import RecursiveCharacterTextSplitter

from database.create_db import load_txt_from_dir

# Directory passed to load_txt_from_dir below.
DEFAULT_DB_PATH = r"../knowledge_db"
# NOTE(review): this path is defined but never referenced in the visible code;
# the client below is created without it -- confirm whether persistence to
# this location was intended.
DEFAULT_PERSIST_PATH = '../vector_db/chroma'

# Create a Chroma client.
chroma_client = chromadb.Client()

# Create (or fetch, if it already exists) a collection -- roughly the
# counterpart of a table in a traditional database.
embeddings = get_embedding(embedding='m3e')
collection = chroma_client.get_or_create_collection(
    name="my_collection",
    embedding_function=embeddings,
)

# Load the source documents from the knowledge-base directory.
all_docs = load_txt_from_dir(DEFAULT_DB_PATH)

# Split the documents into chunks of at most 500 units, with 150 units of
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=150,
)
split_docs = text_splitter.split_documents(all_docs)
# NOTE(review): split_docs is not added to `collection` in the visible
# portion of the file -- verify that this happens elsewhere.