"""Prepare a Chroma collection and split knowledge-base documents into chunks.

Running this module, top to bottom:

1. creates a Chroma client and gets (or creates) the ``my_collection``
   collection, passing the project embedding object as its embedding function;
2. loads documents from ``DEFAULT_DB_PATH`` via ``load_txt_from_dir``;
3. splits them into overlapping chunks with ``RecursiveCharacterTextSplitter``.
"""
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter

from database.create_db import load_txt_from_dir
from embedding.embedding import get_embedding

# Directory the source documents are loaded from (relative to the working dir).
DEFAULT_DB_PATH = r"../knowledge_db"
# NOTE(review): not referenced in the visible part of this script, and the
# client below is created without any path argument -- confirm whether the
# collection is meant to be persisted at this location.
DEFAULT_PERSIST_PATH = '../vector_db/chroma'

# Create a client.
chroma_client = chromadb.Client()

# Create a collection (the analogue of a table in a traditional database).
# 'm3e' presumably selects the M3E embedding model -- verify in get_embedding.
embeddings = get_embedding(embedding='m3e')
# NOTE(review): the return type of get_embedding is not visible here; confirm
# it satisfies the interface Chroma expects for `embedding_function`.
collection = chroma_client.get_or_create_collection(
    name="my_collection",
    embedding_function=embeddings,
)

# Load the documents found under DEFAULT_DB_PATH.
all_docs = load_txt_from_dir(DEFAULT_DB_PATH)

# Split the documents: chunk_size=500 with chunk_overlap=150 between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=150,
)
split_docs = text_splitter.split_documents(all_docs)