feat: basic function for management of people

- recognize people info from input text or image - create a person and save it into the relational database, vector database and object storage - delete a person by people id - get people with pagination and filters by gender, age and height - get the top-N people by searching with natural language
2025-10-09 22:07:03 +08:00
parent 7cea2eb8a5
commit 52d1bc5cf4
19 changed files with 1327 additions and 0 deletions
--- a/src/utils/vsdb.py
+++ b/src/utils/vsdb.py
@@ -0,0 +1,245 @@
+import uuid
+import chromadb
+import logging
+from typing import Protocol
+from chromadb.config import Settings
+from chromadb.utils import embedding_functions
+from .config import get_instance as get_config
+
+class VectorDB(Protocol):
+    """Structural interface for the vector store used by this module.
+
+    Any object providing these four methods can be returned by
+    ``get_instance()``; ``ChromaDB`` is the implementation in this file.
+    """
+
+    def insert(self, metadatas: list[dict], documents: list[str], ids: list[str] = None) -> list[str]:
+        """
+        Insert documents together with their metadata.
+
+        Args:
+            metadatas: One metadata dict per document.
+            documents: Document texts to store.
+            ids: Optional ids, one per document.
+
+        Returns:
+            list[str]: Ids of the inserted documents.
+        """
+        ...
+
+    def delete(self, ids: list[str]) -> bool:
+        """
+        Delete documents from a collection.
+
+        Args:
+            ids: List of IDs to delete
+
+        Returns:
+            bool: Whether deletion was successful
+        """
+        ...
+
+    # `ids` defaults to None so the protocol matches ChromaDB.query, which
+    # already allows the argument to be omitted.
+    def query(self, metadatas: dict, ids: list[str] = None, top_k: int = 5) -> list[dict]:
+        """
+        Look up stored documents by metadata filter and/or ids.
+
+        Args:
+            metadatas: Metadata to filter by.
+            ids: List of IDs to restrict the lookup to.
+            top_k (int, optional): Maximum number of results. Defaults to 5.
+
+        Returns:
+            list[dict]: List of result records.
+        """
+        ...
+
+    def search(self, document: str, metadatas: dict, ids: list[str] = None, top_k: int = 5) -> list[dict]:
+        """
+        Search the store for documents similar to the given text.
+
+        Args:
+            document: Document to search
+            metadatas: Metadata to filter by
+            ids: List of IDs to filter by
+            top_k (int, optional): Number of top results to return. Defaults to 5.
+
+        Returns:
+            list[dict]: List of result records.
+        """
+        ...
+        
+class ChromaDB:
+    """Chroma-backed vector store exposing insert / delete / query / search.
+
+    All operations act on the single collection opened in ``__init__``.
+    """
+
+    _log = logging.getLogger(__name__)
+
+    def __init__(self, **kwargs):
+        """
+        Initialize the ChromaDB instance from the application configuration.
+
+        The embedding endpoint is read from the ``voc-engine_embedding``
+        config section and the storage settings from ``chroma_vsdb``. When
+        ``database_dir`` is set a ``PersistentClient`` on that path is used,
+        otherwise a plain ``chromadb.Client`` is created.
+
+        Args:
+            collection_metadata: Optional dict of collection metadata.
+                ``hnsw:space`` defaults to ``cosine`` when not supplied.
+        """
+        config = get_config()
+        self.embedding_functions = embedding_functions.OpenAIEmbeddingFunction(
+            api_base=config.get("voc-engine_embedding", "api_url"),
+            api_key=config.get("voc-engine_embedding", "api_key"),
+            model_name=config.get("voc-engine_embedding", "endpoint"),
+        )
+        persist_directory = config.get("chroma_vsdb", "database_dir", fallback=None)
+        if persist_directory:
+            self.client = chromadb.PersistentClient(
+                path=persist_directory,
+                settings=Settings(anonymized_telemetry=False)
+            )
+        else:
+            self.client = chromadb.Client(
+                settings=Settings(anonymized_telemetry=False),
+            )
+        self.collection_name = config.get("chroma_vsdb", "collection_name", fallback="peoples")
+        # Work on a copy so the dict passed in by the caller is never mutated.
+        metadata = dict(kwargs.get('collection_metadata') or {})
+        metadata.setdefault('hnsw:space', 'cosine')
+        self.collection = self.client.get_or_create_collection(
+            name=self.collection_name,
+            embedding_function=self.embedding_functions,
+            metadata=metadata,
+        )
+
+    @staticmethod
+    def _format_results(ids, distances, metadatas, documents) -> list[dict]:
+        """Zip parallel result columns into one record dict per hit.
+
+        Any column other than ``ids`` may be None/empty; the matching field
+        then falls back to ``None`` (distance), ``{}`` (metadata) or ``''``
+        (document).
+        """
+        return [
+            {
+                'id': doc_id,
+                'distance': distances[i] if distances else None,
+                'metadata': (metadatas[i] if metadatas else None) or {},
+                'document': (documents[i] if documents else None) or '',
+            }
+            for i, doc_id in enumerate(ids or [])
+        ]
+
+    def insert(self, metadatas: list[dict], documents: list[str], ids: list[str] = None) -> list[str]:
+        """
+        Insert documents into a collection.
+
+        Args:
+            metadatas: List of metadata corresponding to each document
+            documents: List of documents to insert
+            ids: Optional list of unique IDs for each document. If None, IDs will be generated.
+
+        Returns:
+            list[str]: List of inserted IDs (empty when there is nothing to insert)
+        """
+        if not documents:
+            # Nothing to store; skip the round trip to the collection.
+            return []
+
+        if not ids:
+            # Generate unique IDs if not provided
+            ids = [str(uuid.uuid4()) for _ in documents]
+
+        self.collection.add(
+            documents=documents,
+            metadatas=metadatas,
+            ids=ids
+        )
+
+        return ids
+
+    def delete(self, ids: list[str]) -> bool:
+        """
+        Delete documents from a collection.
+
+        Args:
+            ids: List of IDs to delete
+
+        Returns:
+            bool: Whether deletion was successful. An empty ``ids`` list is a
+            successful no-op.
+        """
+        if not ids:
+            # Never forward an empty id list: there is nothing to delete and
+            # the collection must not get a chance to read it as "no filter".
+            return True
+        try:
+            self.collection.delete(ids=ids)
+            return True
+        except Exception:
+            # Best-effort contract: report failure through the return value,
+            # but keep the traceback in the log.
+            self._log.exception("Error deleting documents")
+            return False
+
+    def query(self, metadatas: dict, ids: list[str] = None, top_k: int = 5) -> list[dict]:
+        """
+        Look up stored documents by metadata filter and/or ids.
+
+        No query text is involved, so this uses ``collection.get`` rather than
+        a similarity query; the ``distance`` of every record is ``None``.
+
+        Args:
+            metadatas: Metadata to filter by
+            ids: List of IDs to query
+            top_k (int, optional): Maximum number of results. Defaults to 5.
+
+        Returns:
+            list[dict]: Records with keys ``id``, ``distance``, ``metadata``, ``document``.
+        """
+        results = self.collection.get(
+            ids=ids or None,
+            # Same normalisation as search(): an empty filter means "no filter".
+            where=metadatas or None,
+            limit=top_k,
+            include=["documents", "metadatas"],
+        )
+        return self._format_results(
+            results['ids'],
+            None,
+            results.get('metadatas'),
+            results.get('documents'),
+        )
+
+    def search(self, document: str, metadatas: dict, ids: list[str] = None, top_k: int = 5) -> list[dict]:
+        """
+        Search the collection for documents similar to the given text.
+
+        Args:
+            document: Document to search
+            metadatas: Metadata to filter by
+            ids: List of IDs to filter by
+            top_k (int, optional): Number of top results to return. Defaults to 5.
+
+        Returns:
+            list[dict]: Records with keys ``id``, ``distance``, ``metadata``, ``document``.
+        """
+        # NOTE(review): passing `ids` to Collection.query presumably needs a
+        # recent Chroma release - confirm against the pinned version.
+        results = self.collection.query(
+            query_texts=[document],
+            n_results=top_k,
+            where=metadatas if metadatas else None,
+            ids=ids,
+            include=["documents", "metadatas", "distances"],
+        )
+        self._log.debug("search result ids: %s", results['ids'])
+        if not results['ids']:
+            return []
+        # One query text was sent, so only the first row of each column matters.
+        distances = results.get('distances')
+        metadata_rows = results.get('metadatas')
+        document_rows = results.get('documents')
+        return self._format_results(
+            results['ids'][0],
+            distances[0] if distances else None,
+            metadata_rows[0] if metadata_rows else None,
+            document_rows[0] if document_rows else None,
+        )
+
+# Module-wide vector store; stays None until init() has been called.
+_vsdb_instance: VectorDB = None
+
+
+def init():
+    """Build a ChromaDB store and publish it as the module-wide instance."""
+    global _vsdb_instance
+    _vsdb_instance = ChromaDB()
+
+
+def get_instance() -> VectorDB:
+    """Return the store created by init(), or None if init() has not run yet."""
+    return _vsdb_instance
+
+
+if __name__ == "__main__":
+    # Manual smoke test: set up logging and config, insert four sample
+    # profiles, then print each search hit with its distance.
+    import os
+
+    from logger import init as init_logger
+    init_logger(
+        log_dir="logs",
+        log_file="test",
+        log_level=logging.INFO,
+        console_log_level=logging.DEBUG,
+    )
+
+    from config import init as init_config
+    module_dir = os.path.dirname(__file__)
+    init_config(os.path.join(module_dir, "../../configuration/test_conf.ini"))
+
+    init()
+    store = get_instance()
+
+    sample_documents = [
+        '姓名: 丽丽, 性别: 女, 年龄: 23, 爱好: 爬山、骑行、攀岩、跳伞、蹦极',
+        "姓名: 志刚, 性别: 男, 年龄: 25, 爱好: 读书、游戏",
+        "姓名: 张三, 性别: 男, 年龄: 30, 爱好: 画画、写作、阅读、逛展、旅行",
+        "姓名: 李四, 性别: 男, 年龄: 35, 爱好: 做饭、美食、旅游",
+    ]
+    # One metadata record per document, in the same order.
+    sample_metadatas = [{'name': person} for person in ('丽丽', '志刚', '张三', '李四')]
+    store.insert(sample_metadatas, sample_documents)
+
+    for hit in store.search('25岁以下的', None, None, top_k=4):
+        print(hit['document'], ' ', hit['distance'])