allycat / 3_save_to_vector_db.py
niloydebbarma's picture
Upload 50 files
a7d2416 verified
from my_config import MY_CONFIG
import os
import glob
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from pymilvus import MilvusClient
from llama_index.core import StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.core import VectorStoreIndex
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# logger.setLevel(logging.INFO)
# Step-1: Read Markdown files
pattern = os.path.join(MY_CONFIG.PROCESSED_DATA_DIR, '*.md')
md_file_count = len(glob.glob(pattern, recursive=True))
reader = SimpleDirectoryReader(input_dir=MY_CONFIG.PROCESSED_DATA_DIR, recursive=False, required_exts=[".md"])
documents = reader.load_data()
logger.info (f"Loaded {len(documents)} documents from {md_file_count} files")
# Step-2: Create Chunks
parser = SentenceSplitter(chunk_size=MY_CONFIG.CHUNK_SIZE, chunk_overlap=MY_CONFIG.CHUNK_OVERLAP)
nodes = parser.get_nodes_from_documents(documents)
logger.info (f"Created {len(nodes)} chunks from {len(documents)} documents")
# Step-3: Setup Embedding Model
os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT
Settings.embed_model = HuggingFaceEmbedding(
model_name = MY_CONFIG.EMBEDDING_MODEL
)
# Step-4: Create 2 Vector Databases (Vector RAG and Hybrid GraphRAG databases)
databases_to_create = [
{
"name": "Vector RAG Only",
"uri": MY_CONFIG.MILVUS_URI_VECTOR,
"description": "For Vector RAG systems"
},
{
"name": "Hybrid GraphRAG",
"uri": MY_CONFIG.MILVUS_URI_HYBRID_GRAPH,
"description": "For Hybrid GraphRAG systems"
}
]
for db_config in databases_to_create:
logger.info(f"πŸ“¦ Creating {db_config['name']} database...")
# Connect to Milvus for this database
milvus_client = MilvusClient(db_config['uri'])
logger.info(f"βœ… Connected to: {db_config['uri']}")
if milvus_client.has_collection(collection_name = MY_CONFIG.COLLECTION_NAME):
milvus_client.drop_collection(collection_name = MY_CONFIG.COLLECTION_NAME)
logger.info(f"βœ… Cleared collection: {MY_CONFIG.COLLECTION_NAME}")
# Connect llama-index to vector db
vector_store = MilvusVectorStore(
uri = db_config['uri'],
dim = MY_CONFIG.EMBEDDING_LENGTH,
collection_name = MY_CONFIG.COLLECTION_NAME,
overwrite=True
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Save chunks into vector db
index = VectorStoreIndex(
nodes=nodes,
storage_context=storage_context,
)
logger.info(f"βœ… Stored {len(nodes)} chunks in {db_config['name']}")
milvus_client.close()
logger.info("πŸŽ‰ Both databases created!")
logger.info(f" β€’ Vector RAG: {MY_CONFIG.MILVUS_URI_VECTOR}")
logger.info(f" β€’ Hybrid GraphRAG: {MY_CONFIG.MILVUS_URI_HYBRID_GRAPH}")