Spaces:
Runtime error
Runtime error
| from my_config import MY_CONFIG | |
| import os | |
| import glob | |
| from llama_index.core import SimpleDirectoryReader | |
| from llama_index.core.node_parser import SentenceSplitter | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
| from llama_index.core import Settings | |
| from pymilvus import MilvusClient | |
| from llama_index.core import StorageContext | |
| from llama_index.vector_stores.milvus import MilvusVectorStore | |
| from llama_index.core import VectorStoreIndex | |
| import logging | |
# Configure the root logger: INFO and above, each record prefixed with a
# timestamp and its severity name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-scoped logger used by the steps below.
logger = logging.getLogger(__name__)
# Step-1: Read Markdown files
# Count the *.md files directly inside PROCESSED_DATA_DIR so the log line can
# relate the number of loaded documents to the number of files on disk.
# The pattern has no '**' component, so glob does not descend into
# subdirectories -- the same scope as the reader's recursive=False below.
pattern = os.path.join(MY_CONFIG.PROCESSED_DATA_DIR, '*.md')
md_file_count = len(glob.glob(pattern))

# Load only the Markdown files found at the top level of that directory.
reader = SimpleDirectoryReader(
    input_dir=MY_CONFIG.PROCESSED_DATA_DIR,
    recursive=False,
    required_exts=[".md"],
)
documents = reader.load_data()
logger.info(f"Loaded {len(documents)} documents from {md_file_count} files")
# Step-2: Create Chunks
# Split the loaded documents into chunks; chunk size and overlap both come
# from MY_CONFIG.
parser = SentenceSplitter(
    chunk_overlap=MY_CONFIG.CHUNK_OVERLAP,
    chunk_size=MY_CONFIG.CHUNK_SIZE,
)
nodes = parser.get_nodes_from_documents(documents)

chunk_count, document_count = len(nodes), len(documents)
logger.info(f"Created {chunk_count} chunks from {document_count} documents")
# Step-3: Setup Embedding Model
# Export the configured Hugging Face endpoint before the embedding model is built.
# NOTE(review): huggingface_hub appears to read HF_ENDPOINT when it is first
# imported; if so, setting it here (after the imports at the top of this file)
# may be too late to take effect -- confirm, and move it earlier if needed.
os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT

# Register the embedding model globally so llama-index uses it when indexing.
embedding_model = HuggingFaceEmbedding(model_name=MY_CONFIG.EMBEDDING_MODEL)
Settings.embed_model = embedding_model
# Step-4: Create 2 Vector Databases (Vector RAG and Hybrid GraphRAG databases)
# Each entry describes one Milvus database; both receive the same chunks.
databases_to_create = [
    {
        "name": "Vector RAG Only",
        "uri": MY_CONFIG.MILVUS_URI_VECTOR,
        "description": "For Vector RAG systems",
    },
    {
        "name": "Hybrid GraphRAG",
        "uri": MY_CONFIG.MILVUS_URI_HYBRID_GRAPH,
        "description": "For Hybrid GraphRAG systems",
    },
]

for db_config in databases_to_create:
    logger.info(f"📦 Creating {db_config['name']} database...")

    # Connect to Milvus for this database
    milvus_client = MilvusClient(db_config['uri'])
    try:
        logger.info(f"✅ Connected to: {db_config['uri']}")

        # Drop any existing collection of the same name before repopulating it.
        if milvus_client.has_collection(collection_name=MY_CONFIG.COLLECTION_NAME):
            milvus_client.drop_collection(collection_name=MY_CONFIG.COLLECTION_NAME)
            logger.info(f"✅ Cleared collection: {MY_CONFIG.COLLECTION_NAME}")

        # Connect llama-index to vector db
        vector_store = MilvusVectorStore(
            uri=db_config['uri'],
            dim=MY_CONFIG.EMBEDDING_LENGTH,
            collection_name=MY_CONFIG.COLLECTION_NAME,
            overwrite=True,
        )
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        # Save chunks into vector db
        index = VectorStoreIndex(
            nodes=nodes,
            storage_context=storage_context,
        )
        logger.info(f"✅ Stored {len(nodes)} chunks in {db_config['name']}")
    finally:
        # Always release the client, even when dropping the collection,
        # creating the store, or indexing raises part-way through.
        milvus_client.close()

logger.info("🎉 Both databases created!")
logger.info(f" • Vector RAG: {MY_CONFIG.MILVUS_URI_VECTOR}")
logger.info(f" • Hybrid GraphRAG: {MY_CONFIG.MILVUS_URI_HYBRID_GRAPH}")