Upload 50 files

- .env.cloud.sample +122 -0
- .env.hybrid.sample +126 -0
- .env.local.sample +125 -0
- .gitignore +26 -0
- .python-version +1 -0
- 1_crawl_site.py +188 -0
- 2_process_files.ipynb +135 -0
- 2_process_files.py +141 -0
- 2b_process_graph_phase1.py +881 -0
- 2b_process_graph_phase2.py +427 -0
- 2b_process_graph_phase3.py +1096 -0
- 3_save_to_vector_db.ipynb +329 -0
- 3_save_to_vector_db.py +85 -0
- 3_save_to_vector_db_zilliz.py +106 -0
- 3b_save_to_graph_db.py +1050 -0
- 4_query copy.py +173 -0
- 4_query.ipynb +398 -0
- 4_query.py +194 -0
- 4b_query_graph copy.py +338 -0
- 4b_query_graph.py +327 -0
- CHANGELOG.md +208 -0
- Dockerfile +66 -0
- Dockerfile-dev +65 -0
- LICENSE +201 -0
- README.md +75 -10
- app_chainlit.py +299 -0
- app_chainlit_graph.py +375 -0
- app_flask.py +189 -0
- app_flask_graph.py +264 -0
- chainlit.md +14 -0
- cleanup_pipeline_deps.sh +55 -0
- docker-compose.cloud.yml +34 -0
- docker-compose.hybrid.yml +36 -0
- docker-compose.local.yml +38 -0
- docker-startup.sh +206 -0
- env.sample.txt +177 -0
- file_utils.py +56 -0
- litellm_patch.py +106 -0
- my_config.py +131 -0
- news.md +51 -0
- package-lock.json +523 -0
- package.json +6 -0
- pyproject.toml +28 -0
- query_utils.py +12 -0
- requirements-build.txt +35 -0
- requirements-docker-cloud.txt +49 -0
- requirements-docker.txt +51 -0
- requirements-runtime.txt +59 -0
- requirements.txt +51 -0
- uv.lock +0 -0
.env.cloud.sample
ADDED
@@ -0,0 +1,122 @@
# ============================================
# AllyCAT GraphRAG - Cloud Configuration
# ============================================
# This configuration uses cloud services for all components
# Recommended for production and free-tier deployments
# Docker image size: ~800 MB

# ============================================
# Pipeline Automation (Docker Only)
# ============================================
# Set to 'true' to automatically run the complete pipeline on container startup
# This will: crawl → process → save to vector DB → process graph → save to graph DB
# Recommended for cloud deployments (Heroku, AWS, Google Cloud Run)
AUTO_RUN_PIPELINE=true

# Website to crawl (required if AUTO_RUN_PIPELINE=true)
WEBSITE_URL=https://your-website.com

# Memory Optimization: Remove pipeline dependencies after completion
# Saves ~350-500 MB RAM - Highly recommended for 1GB containers
# Enables deployment on cheaper plans: DigitalOcean $12/mo (1GB) vs $25/mo (2GB)
CLEANUP_PIPELINE_DEPS=true

# ============================================
# LLM Configuration - Cloud Mode
# ============================================
LLM_RUN_ENV=cloud
# Choose your preferred cloud LLM (via LiteLLM)
LLM_MODEL=cerebras/llama3.1-8b
# Alternative models:
# LLM_MODEL=gemini/gemini-1.5-flash

# ============================================
# LLM API Keys (Set at least one)
# ============================================
# Cerebras (Fast, free tier available)
CEREBRAS_API_KEY=your_cerebras_api_key_here

# Google Gemini (Good for graph extraction)
GEMINI_API_KEY=your_gemini_api_key_here

# Nebius (Alternative provider)
NEBIUS_API_KEY=your_nebius_api_key_here

# ============================================
# Vector Database - Zilliz Cloud
# ============================================
VECTOR_DB_TYPE=cloud_zilliz
ZILLIZ_CLUSTER_ENDPOINT=https://your-cluster.zilliz.cloud
ZILLIZ_TOKEN=your_zilliz_token_here

# ============================================
# Graph Database - Neo4j Aura Cloud
# ============================================
NEO4J_URI=neo4j+s://your-instance.databases.neo4j.io
NEO4J_USERNAME=neo4j
NEO4J_PASSWORD=your_neo4j_password_here
NEO4J_DATABASE=neo4j

# ============================================
# Application Settings
# ============================================
# Choose app type: flask_graph, chainlit_graph, flask
APP_TYPE=flask_graph

# ============================================
# Port Configuration
# ============================================
# Flask Applications
FLASK_VECTOR_PORT=8081      # app_flask.py (vector-only RAG)
FLASK_GRAPH_PORT=8080       # app_flask_graph.py (GraphRAG - default)

# Chainlit Applications
CHAINLIT_VECTOR_PORT=8082   # app_chainlit.py
CHAINLIT_GRAPH_PORT=8083    # app_chainlit_graph.py

# Docker & External Services
DOCKER_PORT=8080            # External Docker exposed port (host side)
DOCKER_APP_PORT=8080        # Internal container port (container side, set to match your APP_TYPE)
OLLAMA_PORT=11434           # Ollama server port (not used in cloud mode)

# Workspace directory
# For native execution: use relative path 'workspace'
# For Docker: use absolute path '/allycat/workspace'
WORKSPACE_DIR=/allycat/workspace

# ============================================
# Website Crawling Configuration
# ============================================
WEBSITE_URL=https://example.com
CRAWL_MAX_DOWNLOADS=100
CRAWL_MAX_DEPTH=3
WAITTIME_BETWEEN_REQUESTS=0.1

# ============================================
# Embedding Model Configuration
# ============================================
EMBEDDING_MODEL=ibm-granite/granite-embedding-30m-english
EMBEDDING_LENGTH=384
HF_ENDPOINT=https://hf-mirror.com

# ============================================
# Chunking Configuration
# ============================================
CHUNK_SIZE=512
CHUNK_OVERLAP=20

# ============================================
# Graph Extraction Configuration
# ============================================
GRAPH_MIN_ENTITIES=5
GRAPH_MAX_ENTITIES=15
GRAPH_MIN_RELATIONSHIPS=3
GRAPH_MAX_RELATIONSHIPS=8
GRAPH_MIN_CONFIDENCE=0.8
GRAPH_MAX_CONTENT_CHARS=12000
GRAPH_SENTENCE_BOUNDARY_RATIO=0.7

# ============================================
# UI Settings
# ============================================
UI_STARTER_PROMPTS=What is this website? | What are upcoming events? | Who are the partners?
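Note: as a quick sanity check before deploying, the values above can be loaded and validated in a few lines. This is a minimal illustrative sketch (not part of this commit), assuming python-dotenv is installed and the sample has been copied to .env:

# Minimal sketch: load and sanity-check the cloud config before the pipeline runs.
# Assumes python-dotenv is installed and .env.cloud.sample was copied to .env.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current directory

auto_run = os.getenv("AUTO_RUN_PIPELINE", "false").lower() == "true"
website_url = os.getenv("WEBSITE_URL", "")

# WEBSITE_URL is required when the pipeline auto-runs on container startup
if auto_run and (not website_url or "your-website" in website_url):
    raise SystemExit("Set WEBSITE_URL before enabling AUTO_RUN_PIPELINE")

# Cloud mode needs at least one real LLM API key
keys = ["CEREBRAS_API_KEY", "GEMINI_API_KEY", "NEBIUS_API_KEY"]
if not any(os.getenv(k, "").strip() and "your_" not in os.getenv(k, "") for k in keys):
    raise SystemExit(f"Set at least one of: {keys}")

print(f"Config OK: AUTO_RUN_PIPELINE={auto_run}, WEBSITE_URL={website_url}")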
.env.hybrid.sample
ADDED
@@ -0,0 +1,126 @@
# ============================================
# AllyCAT GraphRAG - Hybrid Configuration
# ============================================
# This configuration uses cloud LLMs with a local vector database
# Recommended for: Privacy-focused deployments with cloud AI benefits
# Docker image size: ~1.5 GB

# ============================================
# Pipeline Automation (Docker Only)
# ============================================
# Set to 'true' to automatically run the complete pipeline on container startup
# This will: crawl → process → save to vector DB → process graph → save to graph DB
AUTO_RUN_PIPELINE=false

# Website to crawl (required if AUTO_RUN_PIPELINE=true)
WEBSITE_URL=https://your-website.com

# Memory Optimization: Remove pipeline dependencies after completion
# Saves ~350-500 MB RAM - Useful for hybrid deployments on budget VPS
CLEANUP_PIPELINE_DEPS=false

# ============================================
# LLM Configuration - Cloud Mode
# ============================================
LLM_RUN_ENV=cloud
# Choose your preferred cloud LLM (via LiteLLM)
LLM_MODEL=cerebras/llama3.1-8b
# Alternative models:
# LLM_MODEL=gemini/gemini-1.5-flash

# ============================================
# LLM API Keys (Set at least one)
# ============================================
# Cerebras (Fast, free tier available)
CEREBRAS_API_KEY=your_cerebras_api_key_here

# Google Gemini (Good for graph extraction)
GEMINI_API_KEY=your_gemini_api_key_here

# Nebius (Alternative provider)
NEBIUS_API_KEY=your_nebius_api_key_here

# ============================================
# Vector Database - Local Milvus
# ============================================
VECTOR_DB_TYPE=local
# Local database files stored in workspace

# ============================================
# Graph Database - Neo4j (Cloud or Local)
# ============================================
# Option 1: Neo4j Aura Cloud (Recommended)
NEO4J_URI=neo4j+s://your-instance.databases.neo4j.io
NEO4J_USERNAME=neo4j
NEO4J_PASSWORD=your_neo4j_password_here
NEO4J_DATABASE=neo4j

# Option 2: Local Neo4j
# NEO4J_URI=bolt://localhost:7687
# NEO4J_USERNAME=neo4j
# NEO4J_PASSWORD=your_local_password
# NEO4J_DATABASE=neo4j

# ============================================
# Application Settings
# ============================================
# Choose app type: flask_graph, chainlit_graph, flask, chainlit
APP_TYPE=flask_graph

# ============================================
# Port Configuration
# ============================================
# Flask Applications
FLASK_VECTOR_PORT=8081      # app_flask.py (vector-only RAG)
FLASK_GRAPH_PORT=8080       # app_flask_graph.py (GraphRAG - default)

# Chainlit Applications
CHAINLIT_VECTOR_PORT=8082   # app_chainlit.py
CHAINLIT_GRAPH_PORT=8083    # app_chainlit_graph.py

# Docker & External Services
DOCKER_PORT=8080            # External Docker exposed port (host side)
DOCKER_APP_PORT=8080        # Internal container port (container side, set to match your APP_TYPE)
OLLAMA_PORT=11434           # Ollama server port (not used in hybrid mode)

# Workspace directory
# For native execution: use relative path 'workspace'
# For Docker: use absolute path '/allycat/workspace'
WORKSPACE_DIR=/allycat/workspace

# ============================================
# Website Crawling Configuration
# ============================================
WEBSITE_URL=https://example.com
CRAWL_MAX_DOWNLOADS=100
CRAWL_MAX_DEPTH=3
WAITTIME_BETWEEN_REQUESTS=0.1

# ============================================
# Embedding Model Configuration
# ============================================
EMBEDDING_MODEL=ibm-granite/granite-embedding-30m-english
EMBEDDING_LENGTH=384
HF_ENDPOINT=https://hf-mirror.com

# ============================================
# Chunking Configuration
# ============================================
CHUNK_SIZE=512
CHUNK_OVERLAP=20

# ============================================
# Graph Extraction Configuration
# ============================================
GRAPH_MIN_ENTITIES=5
GRAPH_MAX_ENTITIES=15
GRAPH_MIN_RELATIONSHIPS=3
GRAPH_MAX_RELATIONSHIPS=8
GRAPH_MIN_CONFIDENCE=0.8
GRAPH_MAX_CONTENT_CHARS=12000
GRAPH_SENTENCE_BOUNDARY_RATIO=0.7

# ============================================
# UI Settings
# ============================================
UI_STARTER_PROMPTS=What is this website? | What are upcoming events? | Who are the partners?
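Note: the only functional change from the cloud configuration is VECTOR_DB_TYPE=local. A minimal sketch of selecting a vector DB client from these variables, assuming pymilvus (with Milvus Lite) is available; the helper and the database filename are illustrative, not the repo's actual wiring:

# Sketch: pick a Milvus client based on VECTOR_DB_TYPE.
# Assumes pymilvus is installed; "local" uses Milvus Lite (a .db file in the workspace).
import os
from pymilvus import MilvusClient

def get_vector_db() -> MilvusClient:
    if os.getenv("VECTOR_DB_TYPE", "local") == "cloud_zilliz":
        return MilvusClient(
            uri=os.environ["ZILLIZ_CLUSTER_ENDPOINT"],
            token=os.environ["ZILLIZ_TOKEN"],
        )
    workspace = os.getenv("WORKSPACE_DIR", "workspace")
    return MilvusClient(uri=os.path.join(workspace, "vectors.db"))  # hypothetical filename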
.env.local.sample
ADDED
@@ -0,0 +1,125 @@
# ============================================
# AllyCAT GraphRAG - Local Configuration
# ============================================
# This configuration uses local services for all components
# Recommended for development, testing, and offline deployments
# Docker image size: ~4+ GB (includes Ollama)

# ============================================
# Pipeline Automation (Docker Only)
# ============================================
# Set to 'true' to automatically run the complete pipeline on container startup
# This will: crawl → process → save to vector DB → process graph → save to graph DB
# For local development, typically set to 'false' to run steps manually
AUTO_RUN_PIPELINE=false

# Website to crawl (required if AUTO_RUN_PIPELINE=true)
WEBSITE_URL=https://your-website.com

# Memory Optimization: Remove pipeline dependencies after completion
# Saves ~350-500 MB RAM (less critical for local development)
# Set to true if running in resource-constrained environments
CLEANUP_PIPELINE_DEPS=false

# ============================================
# LLM Configuration - Local Ollama
# ============================================
LLM_RUN_ENV=local_ollama
LLM_MODEL=ollama/gemma3:1b
# Model to download and use
OLLAMA_MODEL=gemma3:1b
# Alternative local models:
# OLLAMA_MODEL=qwen2.5:1.5b
# OLLAMA_MODEL=llama3.2:1b

# ============================================
# Vector Database - Local Milvus
# ============================================
VECTOR_DB_TYPE=local
# Local database files stored in workspace

# ============================================
# Graph Database - Local or Cloud Neo4j
# ============================================
# Option 1: Local Neo4j (requires separate Neo4j installation)
NEO4J_URI=bolt://localhost:7687
NEO4J_USERNAME=neo4j
NEO4J_PASSWORD=your_local_password
NEO4J_DATABASE=neo4j

# Option 2: Use Neo4j Aura Cloud even in local mode
# NEO4J_URI=neo4j+s://your-instance.databases.neo4j.io
# NEO4J_USERNAME=neo4j
# NEO4J_PASSWORD=your_neo4j_password_here
# NEO4J_DATABASE=neo4j

# ============================================
# Graph Extraction LLM (Cloud API recommended)
# ============================================
# Even in local mode, graph extraction benefits from cloud LLMs
# Set at least one for graph building:
GEMINI_API_KEY=your_gemini_api_key_here
CEREBRAS_API_KEY=your_cerebras_api_key_here

# ============================================
# Application Settings
# ============================================
APP_TYPE=flask_graph

# ============================================
# Port Configuration
# ============================================
# Flask Applications
FLASK_VECTOR_PORT=8081      # app_flask.py (vector-only RAG)
FLASK_GRAPH_PORT=8080       # app_flask_graph.py (GraphRAG - default)

# Chainlit Applications
CHAINLIT_VECTOR_PORT=8082   # app_chainlit.py
CHAINLIT_GRAPH_PORT=8083    # app_chainlit_graph.py

# Docker & External Services
DOCKER_PORT=8080            # External Docker exposed port (host side)
DOCKER_APP_PORT=8080        # Internal container port (container side, set to match your APP_TYPE)
OLLAMA_PORT=11434           # Ollama server port (for local LLM)

# Workspace directory
# For native execution: use relative path 'workspace'
# For Docker: use absolute path '/allycat/workspace'
WORKSPACE_DIR=/allycat/workspace

# ============================================
# Website Crawling Configuration
# ============================================
WEBSITE_URL=https://example.com
CRAWL_MAX_DOWNLOADS=100
CRAWL_MAX_DEPTH=3
WAITTIME_BETWEEN_REQUESTS=0.1

# ============================================
# Embedding Model Configuration
# ============================================
EMBEDDING_MODEL=ibm-granite/granite-embedding-30m-english
EMBEDDING_LENGTH=384
HF_ENDPOINT=https://hf-mirror.com

# ============================================
# Chunking Configuration
# ============================================
CHUNK_SIZE=512
CHUNK_OVERLAP=20

# ============================================
# Graph Extraction Configuration
# ============================================
GRAPH_MIN_ENTITIES=5
GRAPH_MAX_ENTITIES=15
GRAPH_MIN_RELATIONSHIPS=3
GRAPH_MAX_RELATIONSHIPS=8
GRAPH_MIN_CONFIDENCE=0.8
GRAPH_MAX_CONTENT_CHARS=12000
GRAPH_SENTENCE_BOUNDARY_RATIO=0.7

# ============================================
# UI Settings
# ============================================
UI_STARTER_PROMPTS=What is this website? | What are upcoming events? | Who are the partners?
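Note: the LLM_RUN_ENV / LLM_MODEL pair is what lets the same application code serve all three configurations. A sketch of how such a switch can work through LiteLLM (illustrative only; the repo's actual routing is not shown in this excerpt):

# Sketch: dispatch a completion call on LLM_RUN_ENV via LiteLLM.
# Assumes litellm is installed; helper name is illustrative.
import os
import litellm

def ask_llm(prompt: str) -> str:
    model = os.getenv("LLM_MODEL", "ollama/gemma3:1b")
    messages = [{"role": "user", "content": prompt}]
    if os.getenv("LLM_RUN_ENV") == "local_ollama":
        # LiteLLM routes "ollama/..." models to the local Ollama server
        api_base = f"http://localhost:{os.getenv('OLLAMA_PORT', '11434')}"
        response = litellm.completion(model=model, messages=messages, api_base=api_base)
    else:
        # Cloud providers (cerebras/..., gemini/...) read API keys from the environment
        response = litellm.completion(model=model, messages=messages)
    return response.choices[0].message.content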
.gitignore
ADDED
@@ -0,0 +1,26 @@
workspace/
*.out*
.directory
venv*
.vscode
tmp

# Ignore actual .env file but allow sample files
.env
!.env.*.sample
!env.sample.txt

*.db
*.db.lock

## profiling outputs
*.speed

__pycache__

chainlit.md

node_modules/

logs/
logs/*
.python-version
ADDED
@@ -0,0 +1 @@
3.11
1_crawl_site.py
ADDED
@@ -0,0 +1,188 @@
#!/usr/bin/env python3

import argparse
import shutil
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import logging
import os
import re
import mimetypes
from my_config import MY_CONFIG

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class WebScraper:
    def __init__(self, url, max_downloads, depth):
        self.url = url
        self.max_downloads = max_downloads
        self.depth = depth
        self.visited_urls = set()
        self.downloaded_base_urls = set()  # Track base URLs without fragments
        self.downloaded_count = 0

    def scrape_page(self, url, current_depth=0):
        try:
            # For downloading, remove the fragment since HTTP requests ignore it
            parsed_url = urlparse(url)
            download_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
            if parsed_url.query:
                download_url += f"?{parsed_url.query}"

            # Check if we've already downloaded this base URL content
            if download_url in self.downloaded_base_urls:
                # If we have a fragment and haven't visited this exact URL, save with fragment name
                if parsed_url.fragment and url not in self.visited_urls:
                    # No response cache is kept, so re-request the base URL
                    response = requests.get(download_url, timeout=10)
                    response.raise_for_status()

                    filename = self.url_to_filename(url, response)
                    filepath = os.path.join(MY_CONFIG.CRAWL_DIR, filename)

                    # Handle binary files vs text files based on mime type
                    mime_type = response.headers.get('Content-Type', '').lower()
                    is_text = mime_type.startswith('text/') or 'html' in mime_type or 'xml' in mime_type

                    if is_text:
                        with open(filepath, 'w', encoding='utf-8') as f:
                            f.write(response.text)
                    else:
                        with open(filepath, 'wb') as f:
                            f.write(response.content)

                    self.downloaded_count += 1
                    logger.info(f"Saved {filepath} with fragment ({self.downloaded_count}/{self.max_downloads})")

                    return []  # Don't re-parse links from the same content
                else:
                    logger.info(f"Skipping already downloaded URL: {download_url}")
                    return []

            response = requests.get(download_url, timeout=10)
            response.raise_for_status()

            # Track that we've downloaded this base URL
            self.downloaded_base_urls.add(download_url)

            # Save file using original URL (with fragment) for unique filename
            filename = self.url_to_filename(url, response)
            filepath = os.path.join(MY_CONFIG.CRAWL_DIR, filename)

            # Handle binary files vs text files based on mime type
            mime_type = response.headers.get('Content-Type', '').lower()
            is_text = mime_type.startswith('text/') or 'html' in mime_type or 'xml' in mime_type

            if is_text:
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(response.text)
            else:
                with open(filepath, 'wb') as f:
                    f.write(response.content)

            self.downloaded_count += 1
            logger.info(f"Saved {filepath} ({self.downloaded_count}/{self.max_downloads})")

            # Parse for links if not at max depth
            links = []
            if current_depth < self.depth:
                soup = BeautifulSoup(response.content, 'html.parser')
                base_domain = urlparse(self.url).netloc
                for link in soup.find_all('a', href=True):
                    full_url = urljoin(url, link.get('href'))
                    if urlparse(full_url).netloc == base_domain:
                        links.append(full_url)

            return links

        except Exception as e:
            logger.error(f"Error scraping {url}: {str(e)}")
            return []

    def url_to_filename(self, url, response):
        # Keep domain and path, strip protocol, use __ for directory separators
        parsed = urlparse(url)
        domain = parsed.netloc
        path = parsed.path
        fragment = parsed.fragment

        if not path or path == '/':
            filename = f"{domain}__index"
        else:
            filename = f"{domain}{path.replace('/', '__')}"

        # Add fragment (anchor) to filename if present
        if fragment:
            filename = f"{filename}__{fragment}"

        filename = re.sub(r'[^\w\-_.]', '_', filename)

        mime_type = response.headers.get('Content-Type')
        if mime_type:
            inferred_extension = mimetypes.guess_extension(mime_type.split(';')[0].strip())
        else:
            inferred_extension = '.html'

        # Append .html if the filename doesn't already end with the inferred extension
        if not filename.endswith(inferred_extension):
            filename = f"{filename}.html"

        return filename

    def scrape(self):
        shutil.rmtree(MY_CONFIG.CRAWL_DIR, ignore_errors=True)
        os.makedirs(MY_CONFIG.CRAWL_DIR, exist_ok=True)
        logger.info(f"✅ Cleared crawl directory: {MY_CONFIG.CRAWL_DIR}")

        logger.info(f"⚙ Starting scrape of {self.url}, max downloads: {self.max_downloads}, depth: {self.depth}")

        urls_to_visit = [(self.url, 0)]  # (url, depth)

        while urls_to_visit and self.downloaded_count < self.max_downloads:
            current_url, current_depth = urls_to_visit.pop(0)

            if current_url in self.visited_urls:
                continue

            self.visited_urls.add(current_url)

            links = self.scrape_page(current_url, current_depth)

            # Add new URLs if not at max depth
            if current_depth < self.depth:
                for link in links:
                    if link not in self.visited_urls:
                        urls_to_visit.append((link, current_depth + 1))

            time.sleep(MY_CONFIG.WAITTIME_BETWEEN_REQUESTS)


def main():
    parser = argparse.ArgumentParser(description="Web scraper")
    parser.add_argument("--url", type=str, default=MY_CONFIG.WEBSITE_URL, help=f"URL to scrape (default: {MY_CONFIG.WEBSITE_URL})")
    parser.add_argument("--max-downloads", type=int, default=MY_CONFIG.CRAWL_MAX_DOWNLOADS, help=f"Maximum number of files to download (default: {MY_CONFIG.CRAWL_MAX_DOWNLOADS})")
    parser.add_argument("--depth", type=int, default=MY_CONFIG.CRAWL_MAX_DEPTH, help=f"Maximum depth to crawl (default: {MY_CONFIG.CRAWL_MAX_DEPTH})")

    args = parser.parse_args()

    scraper = WebScraper(args.url, args.max_downloads, args.depth)
    scraper.scrape()

    logger.info(f"✅ Scraping completed. Downloaded {scraper.downloaded_count} files to '{MY_CONFIG.CRAWL_DIR}' directory.")

if __name__ == "__main__":
    main()
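Note: besides the CLI in main(), the crawler can be driven programmatically. A small usage sketch (illustrative; the module name starts with a digit, so it is imported by string):

# Usage sketch: run the crawler from another script instead of the CLI.
# WebScraper walks same-domain links breadth-first up to `depth`, saving
# each page under MY_CONFIG.CRAWL_DIR (which scrape() clears first).
from importlib import import_module

crawl = import_module("1_crawl_site")
scraper = crawl.WebScraper("https://example.com", max_downloads=25, depth=2)
scraper.scrape()
print(f"Downloaded {scraper.downloaded_count} files")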
2_process_files.ipynb
ADDED
@@ -0,0 +1,135 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Processing HTML Files\n",
    "\n",
    "We will be using **docling**\n",
    "\n",
    "References\n",
    "- [docling](https://github.com/DS4SD/docling)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-1: Data\n",
    "\n",
    "We will process data that is downloaded using [1_crawl_site.ipynb](1_crawl_site.ipynb).\n",
    "\n",
    "We have a couple of crawled HTML files in the `input` directory. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-2: Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "## All config is defined here\n",
    "from my_config import MY_CONFIG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Cleared processed data directory : workspace/processed\n"
     ]
    }
   ],
   "source": [
    "import os, sys\n",
    "import shutil\n",
    "\n",
    "shutil.rmtree(MY_CONFIG.PROCESSED_DATA_DIR, ignore_errors=True)\n",
    "os.makedirs(MY_CONFIG.PROCESSED_DATA_DIR, exist_ok=True)\n",
    "print(f\"✅ Cleared processed data directory : {MY_CONFIG.PROCESSED_DATA_DIR}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-3: Convert FILES --> MD\n",
    "\n",
    "Process HTML documents and extract the text in markdown format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time \n",
    "\n",
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "from docling.document_converter import DocumentConverter\n",
    "\n",
    "converter = DocumentConverter(format_options={\"preserve_links\": True})\n",
    "\n",
    "input_path = Path(MY_CONFIG.CRAWL_DIR)\n",
    "input_files = list(input_path.glob('*.html')) + list(input_path.glob('*.htm')) + list(input_path.glob('*.pdf'))\n",
    "print(f\"Found {len(input_files)} files to convert\")\n",
    "\n",
    "files_processed = 0\n",
    "errors = 0\n",
    "for input_file in input_files:\n",
    "    try:\n",
    "        result = converter.convert(input_file)\n",
    "        markdown_content = result.document.export_to_markdown()\n",
    "\n",
    "        md_file_name = os.path.join(MY_CONFIG.PROCESSED_DATA_DIR, f\"{input_file.stem}.md\")\n",
    "        with open(md_file_name, \"w\", encoding=\"utf-8\") as md_file:\n",
    "            md_file.write(markdown_content)\n",
    "\n",
    "        print(f\"Converted '{input_file}' --> '{md_file_name}'\")\n",
    "        files_processed += 1\n",
    "    except Exception as e:\n",
    "        errors += 1\n",
    "        print(f\"Error processing {input_file}: {e}\")\n",
    "\n",
    "print(f\"✅ Processed {files_processed} files. Errors: {errors}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "allycat-1",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
2_process_files.py
ADDED
@@ -0,0 +1,141 @@
import os, sys
import shutil
from pathlib import Path
from docling.document_converter import DocumentConverter
import html2text
import logging
import hashlib
from my_config import MY_CONFIG

logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

def cleanup_duplicate_markdown_files(processed_dir):
    """
    Remove duplicate markdown files based on content hash.
    Keeps the first file encountered for each unique content.
    """
    processed_path = Path(processed_dir)
    md_files = list(processed_path.glob('*.md'))

    if not md_files:
        logger.info("No markdown files found for deduplication")
        return 0

    content_hashes = {}
    duplicates_removed = 0

    for md_file in md_files:
        try:
            with open(md_file, 'r', encoding='utf-8') as f:
                content = f.read()

            content_hash = hashlib.md5(content.encode('utf-8')).hexdigest()

            if content_hash in content_hashes:
                os.remove(md_file)
                duplicates_removed += 1
                logger.info(f"Removed duplicate: {md_file} (same content as {content_hashes[content_hash]})")
            else:
                content_hashes[content_hash] = md_file

        except Exception as e:
            logger.warning(f"Error processing {md_file} for deduplication: {e}")

    logger.info(f"✅ Deduplication complete. Removed {duplicates_removed} duplicate files")
    return duplicates_removed
## --- end of cleanup_duplicate_markdown_files ---

def process_files(crawl_dir, processed_dir):
    """
    Process all files in the crawl directory and convert them to markdown.
    Uses html2text for HTML/HTM files and docling for PDFs and other documents.

    Args:
        crawl_dir (str): Directory containing files to process
        processed_dir (str): Directory to save processed markdown files
    """

    input_path = Path(crawl_dir)
    input_files = list(input_path.glob('*'))
    logger.info(f"Found {len(input_files)} files to process in {input_path}")

    shutil.rmtree(processed_dir, ignore_errors=True)
    os.makedirs(processed_dir, exist_ok=True)
    logger.info(f"✅ Cleared processed data directory : {processed_dir}")

    # Initialize converters
    docling_converter = DocumentConverter(format_options={"preserve_links": True})
    html_converter = html2text.HTML2Text()
    html_converter.ignore_links = False
    html_converter.ignore_images = False

    files_processed = 0
    errors = 0
    file_type_stats = {}

    for input_file in input_files:
        file_ext = input_file.suffix.lower()
        markdown_content = None

        try:
            # Process HTML/HTM files with html2text
            if file_ext in ['.html', '.htm']:
                with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
                    html_content = f.read()
                markdown_content = html_converter.handle(html_content)
                logger.debug(f"Converted HTML '{input_file}' with html2text")

            # Process TXT files directly
            elif file_ext == '.txt':
                with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
                    markdown_content = f.read()
                logger.debug(f"Processed TXT '{input_file}' directly")

            # Process PDF and other documents with docling
            else:
                result = docling_converter.convert(input_file)
                markdown_content = result.document.export_to_markdown()
                logger.debug(f"Converted '{input_file}' with docling")

            # Save markdown file
            if markdown_content:
                md_file_name = os.path.join(processed_dir, f"{input_file.stem}.md")
                with open(md_file_name, "w", encoding="utf-8") as md_file:
                    md_file.write(markdown_content)

                files_processed += 1
                file_type_stats[file_ext] = file_type_stats.get(file_ext, 0) + 1

        except Exception as e:
            errors += 1
            logger.warning(f"Error processing {input_file}: {e}")

    logger.info(f"✅ Processed {files_processed} files. Errors: {errors}")

    # Print file type statistics in compact dictionary format
    if file_type_stats:
        logger.info(f"📊 File type statistics: {dict(sorted(file_type_stats.items()))}")

    return files_processed, errors, file_type_stats
## --- end of process_files ---

def main():
    """
    Main function to run the file processing pipeline.
    """
    logger.info("🚀 Starting file processing pipeline")

    try:
        files_processed, errors, file_type_stats = process_files(MY_CONFIG.CRAWL_DIR, MY_CONFIG.PROCESSED_DATA_DIR)
        duplicates_removed = cleanup_duplicate_markdown_files(MY_CONFIG.PROCESSED_DATA_DIR)
        logger.info(f"✅ Final summary: {files_processed} files processed, {errors} errors, {duplicates_removed} duplicates removed")
        logger.info("✅ File processing pipeline completed successfully")
        return 0
    except Exception as e:
        logger.error(f"❌ File processing pipeline failed: {e}")
        return 1

if __name__ == "__main__":
    sys.exit(main())
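Note: the deduplication step exists because the crawler can save identical page content under several fragment-derived filenames. A self-contained demo of the same content-hash technique (illustrative, independent of the pipeline):

# Demo of the content-hash deduplication used above:
# identical bytes -> identical MD5 -> later copies are dropped.
import hashlib, tempfile
from pathlib import Path

tmp = Path(tempfile.mkdtemp())
(tmp / "a.md").write_text("same page text")
(tmp / "b.md").write_text("same page text")   # duplicate of a.md
(tmp / "c.md").write_text("different text")

seen = {}
for f in sorted(tmp.glob("*.md")):
    digest = hashlib.md5(f.read_bytes()).hexdigest()
    if digest in seen:
        f.unlink()                             # keep only the first copy
    else:
        seen[digest] = f

print(sorted(p.name for p in tmp.glob("*.md")))  # ['a.md', 'c.md']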
2b_process_graph_phase1.py
ADDED
@@ -0,0 +1,881 @@
| 1 |
+
"""
|
| 2 |
+
GraphRAG Phase 1: LLM-based Entity and Relationship Extraction
|
| 3 |
+
Builds initial knowledge graph from markdown files using LLMs (Cerebras or Gemini)
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import logging
|
| 8 |
+
import os
|
| 9 |
+
import time
|
| 10 |
+
import uuid
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Any, Dict, List
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
import orjson
|
| 15 |
+
from json_repair import repair_json
|
| 16 |
+
import google.generativeai as genai
|
| 17 |
+
import openai
|
| 18 |
+
from my_config import MY_CONFIG
|
| 19 |
+
|
| 20 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
class GraphBuilder:
|
| 24 |
+
|
| 25 |
+
def __init__(self, llm_provider="cerebras"):
|
| 26 |
+
self.llm_provider = llm_provider.lower()
|
| 27 |
+
|
| 28 |
+
# Global entity registry for deduplication across files
|
| 29 |
+
self.global_entity_registry = {}
|
| 30 |
+
|
| 31 |
+
# Initialize graph data structure
|
| 32 |
+
self.graph_data = {"nodes": [], "relationships": []}
|
| 33 |
+
self.processed_files = 0
|
| 34 |
+
|
| 35 |
+
# Initialize LLM API based on provider
|
| 36 |
+
if self.llm_provider == "cerebras":
|
| 37 |
+
if not MY_CONFIG.CEREBRAS_API_KEY:
|
| 38 |
+
raise ValueError("CEREBRAS_API_KEY environment variable not set. Get free key at: https://cloud.cerebras.ai/")
|
| 39 |
+
|
| 40 |
+
# Configure Cerebras client
|
| 41 |
+
self.cerebras_client = openai.OpenAI(
|
| 42 |
+
api_key=MY_CONFIG.CEREBRAS_API_KEY,
|
| 43 |
+
base_url="https://api.cerebras.ai/v1"
|
| 44 |
+
)
|
| 45 |
+
self.model_name = "llama-4-scout-17b-16e-instruct"
|
| 46 |
+
logger.info("🚀 Using Cerebras API")
|
| 47 |
+
|
| 48 |
+
elif self.llm_provider == "gemini":
|
| 49 |
+
if not MY_CONFIG.GEMINI_API_KEY:
|
| 50 |
+
raise ValueError("GEMINI_API_KEY environment variable not set. Get free key at: https://aistudio.google.com/")
|
| 51 |
+
|
| 52 |
+
# Configure Gemini with FREE tier
|
| 53 |
+
genai.configure(api_key=MY_CONFIG.GEMINI_API_KEY)
|
| 54 |
+
self.model_name = "gemini-1.5-flash"
|
| 55 |
+
self.gemini_model = genai.GenerativeModel(self.model_name)
|
| 56 |
+
logger.info("🆓 Using Google Gemini API,)")
|
| 57 |
+
|
| 58 |
+
else:
|
| 59 |
+
valid_providers = ["cerebras", "gemini"]
|
| 60 |
+
raise ValueError(f"Invalid provider '{llm_provider}'. Choose from: {valid_providers}")
|
| 61 |
+
|
| 62 |
+
# Configure extraction parameters
|
| 63 |
+
self.min_entities = int(os.getenv("GRAPH_MIN_ENTITIES", "5"))
|
| 64 |
+
self.max_entities = int(os.getenv("GRAPH_MAX_ENTITIES", "15"))
|
| 65 |
+
self.min_relationships = int(os.getenv("GRAPH_MIN_RELATIONSHIPS", "3"))
|
| 66 |
+
self.max_relationships = int(os.getenv("GRAPH_MAX_RELATIONSHIPS", "8"))
|
| 67 |
+
self.min_confidence = float(os.getenv("GRAPH_MIN_CONFIDENCE", "0.8"))
|
| 68 |
+
self.max_content_chars = int(os.getenv("GRAPH_MAX_CONTENT_CHARS", "12000"))
|
| 69 |
+
self.sentence_boundary_ratio = float(os.getenv("GRAPH_SENTENCE_BOUNDARY_RATIO", "0.7"))
|
| 70 |
+
|
| 71 |
+
logger.info(f"✅ Initialized {self.llm_provider.upper()} provider with model: {self.model_name}")
|
| 72 |
+
logger.info(f"Extraction config: {self.min_entities}-{self.max_entities} entities, {self.min_relationships}-{self.max_relationships} relationships, min confidence: {self.min_confidence}")
|
| 73 |
+
logger.info(f"Content processing: {self.max_content_chars} chars per chunk with overlap for FULL analysis")
|
| 74 |
+
|
| 75 |
+
# STEP 0: Clean Graph Data Folder
|
| 76 |
+
def clean_graph_folder(self, graph_dir: str = None):
|
| 77 |
+
if graph_dir is None:
|
| 78 |
+
graph_dir = "workspace/graph_data"
|
| 79 |
+
try:
|
| 80 |
+
graph_path = Path(graph_dir)
|
| 81 |
+
if graph_path.exists():
|
| 82 |
+
# Remove all files in the directory
|
| 83 |
+
for file_path in graph_path.glob("*"):
|
| 84 |
+
if file_path.is_file():
|
| 85 |
+
file_path.unlink()
|
| 86 |
+
logger.debug(f"Removed: {file_path.name}")
|
| 87 |
+
logger.info(f"Cleaned graph folder: {graph_dir}")
|
| 88 |
+
else:
|
| 89 |
+
# Create directory if it doesn't exist
|
| 90 |
+
graph_path.mkdir(parents=True, exist_ok=True)
|
| 91 |
+
logger.info(f"Created graph folder: {graph_dir}")
|
| 92 |
+
except Exception as e:
|
| 93 |
+
logger.warning(f"Failed to clean graph folder: {e}")
|
| 94 |
+
|
| 95 |
+
# STEP 1: Content Preprocessing and Chunking
|
| 96 |
+
def _preprocess_content(self, text: str, max_chars: int = None) -> str:
|
| 97 |
+
# Remove excessive whitespace but keep full content
|
| 98 |
+
text = ' '.join(text.split())
|
| 99 |
+
return text.strip()
|
| 100 |
+
|
| 101 |
+
def _chunk_content(self, text: str, chunk_size: int = None, overlap: int = 200) -> List[str]:
|
| 102 |
+
if chunk_size is None:
|
| 103 |
+
chunk_size = self.max_content_chars
|
| 104 |
+
|
| 105 |
+
# If content fits in one chunk, return as-is
|
| 106 |
+
if len(text) <= chunk_size:
|
| 107 |
+
return [text]
|
| 108 |
+
|
| 109 |
+
chunks = []
|
| 110 |
+
start = 0
|
| 111 |
+
|
| 112 |
+
while start < len(text):
|
| 113 |
+
# Calculate end position
|
| 114 |
+
end = start + chunk_size
|
| 115 |
+
|
| 116 |
+
if end >= len(text):
|
| 117 |
+
# Last chunk
|
| 118 |
+
chunks.append(text[start:])
|
| 119 |
+
break
|
| 120 |
+
|
| 121 |
+
# Try to find good break point (sentence boundary)
|
| 122 |
+
chunk_text = text[start:end]
|
| 123 |
+
last_period = chunk_text.rfind('.')
|
| 124 |
+
last_newline = chunk_text.rfind('\n')
|
| 125 |
+
|
| 126 |
+
# Use best break point
|
| 127 |
+
break_point = max(last_period, last_newline)
|
| 128 |
+
if break_point > chunk_size * 0.7: # Good break point
|
| 129 |
+
actual_end = start + break_point + 1
|
| 130 |
+
chunks.append(text[start:actual_end])
|
| 131 |
+
start = actual_end - overlap # Overlap for context
|
| 132 |
+
else:
|
| 133 |
+
# No good break point, use hard split
|
| 134 |
+
chunks.append(text[start:end])
|
| 135 |
+
start = end - overlap
|
| 136 |
+
|
| 137 |
+
return chunks
|
| 138 |
+
|
| 139 |
+
    # STEP 2: LLM Prompt Generation
    def get_entity_extraction_prompt(self) -> str:
        return f"""You are a specialized knowledge graph extraction assistant. Your task is to analyze content and extract entities and relationships to build comprehensive knowledge graphs.

DYNAMIC EXTRACTION REQUIREMENTS:
- Extract {self.min_entities}-{self.max_entities} most important entities from the content
- Create {self.min_relationships}-{self.max_relationships} meaningful relationships between entities
- Confidence threshold: {self.min_confidence} (only include high-confidence extractions)
- Focus on extracting diverse entity types relevant to the content domain

CONSTITUTIONAL AI PRINCIPLES:
1. Content-Adaptive: Determine entity types based on content analysis, not predefined categories
2. Relationship-Rich: Focus on meaningful semantic relationships between entities
3. Context-Aware: Consider document context and domain when extracting entities
4. Quality-First: Prioritize extraction quality over quantity

ENTITY EXTRACTION GUIDELINES:
- Identify the most important concepts, terms, people, places, organizations, technologies, events
- Extract entities that would be valuable for knowledge graph queries
- Include both explicit entities (directly mentioned) and implicit entities (strongly implied)
- Assign appropriate types based on semantic analysis of the entity's role in the content

RELATIONSHIP EXTRACTION GUIDELINES:
- Create relationships that capture semantic meaning, not just co-occurrence
- Use descriptive relationship types that express the nature of the connection
- Include hierarchical, associative, and causal relationships where appropriate
- Ensure relationships are bidirectionally meaningful and contextually accurate

OUTPUT FORMAT (strict JSON):
{{
  "entities": [
    {{
      "text": "Entity Name",
      "type": "DynamicType",
      "content": "Comprehensive description of the entity",
      "confidence": 0.95
    }}
  ],
  "relationships": [
    {{
      "startNode": "Entity Name 1",
      "endNode": "Entity Name 2",
      "type": "DESCRIPTIVE_RELATIONSHIP_TYPE",
      "description": "Clear description of the relationship",
      "evidence": "Direct evidence from text supporting this relationship",
      "confidence": 0.90
    }}
  ]
}}

IMPORTANT: Respond with ONLY the JSON object. No explanations, no markdown formatting, no code blocks."""

    # STEP 3: LLM Inference Methods
    def _cerebras_inference(self, system_prompt: str, user_prompt: str) -> str:
        try:
            # Cerebras uses the OpenAI-compatible chat format
            response = self.cerebras_client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.1,
                max_tokens=2000
            )

            # Check for an empty response
            if not response or not response.choices or not response.choices[0].message.content:
                raise ValueError("Empty response from Cerebras")

            return response.choices[0].message.content.strip()

        except Exception as e:
            # Check for quota/rate limit exceeded errors
            error_str = str(e).lower()
            if "429" in str(e) and "quota" in error_str:
                logger.error(f"🚫 QUOTA EXCEEDED: Cerebras API rate/quota limit reached - {e}")
                raise Exception("QUOTA_EXCEEDED") from e
            else:
                logger.error(f"Error with Cerebras inference: {e}")
                raise e

    def _gemini_inference(self, system_prompt: str, user_prompt: str) -> str:
        try:
            combined_prompt = f"{system_prompt}\n\n{user_prompt}"
            response = self.gemini_model.generate_content(combined_prompt)
            if not response or not response.text:
                raise ValueError("Empty response from Gemini")

            return response.text.strip()

        except Exception as e:
            # Check for a quota exceeded error
            if "429" in str(e) and "quota" in str(e).lower():
                logger.error(f"🚫 QUOTA EXCEEDED: Gemini API daily limit reached - {e}")
                raise Exception("QUOTA_EXCEEDED") from e
            else:
                logger.error(f"Error with Gemini inference: {e}")
                raise e

    # STEP 4: JSON Parsing Pipeline
    def _smart_json_parse(self, json_text: str) -> Dict[str, Any]:
        cleaned_text = json_text.strip()

        # Step 1: orjson
        try:
            result = orjson.loads(cleaned_text.encode('utf-8'))
            logger.debug("✅ Step 1: orjson succeeded")
            return result
        except Exception as e:
            logger.debug(f"❌ Step 1: orjson failed - {e}")

        # Step 2: json-repair + orjson
        try:
            repaired = repair_json(cleaned_text)
            result = orjson.loads(repaired.encode('utf-8'))
            logger.debug("✅ Step 2: json-repair + orjson succeeded")
            return result
        except Exception as e:
            logger.debug(f"❌ Step 2: json-repair failed - {e}")

        # Step 3: standard json
        try:
            result = json.loads(cleaned_text)
            logger.debug("✅ Step 3: standard json succeeded")
            return result
        except Exception as e:
            logger.debug(f"❌ Step 3: standard json failed - {e}")

        # Step 4: json-repair + standard json
        try:
            repaired = repair_json(cleaned_text)
            result = json.loads(repaired)
            logger.debug("✅ Step 4: json-repair + standard json succeeded")
            return result
        except Exception as e:
            logger.debug(f"❌ Step 4: json-repair + standard json failed - {e}")

        # All four steps failed - this triggers saving the failed response to a txt log
        raise ValueError("All 4 JSON parsing steps failed")

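LLM output often arrives as almost-JSON (trailing commas, unquoted keys, single quotes); the escalating chain exists because `orjson` is strict and fast while `repair_json` is forgiving. A small sketch, assuming `repair_json` comes from the `json_repair` package as its name suggests (the malformed payload is made up):

```python
import orjson
from json_repair import repair_json

# Typical LLM damage: an unquoted key and a trailing comma.
bad = '{"entities": [{"text": "AllyCAT", type: "Project", "confidence": 0.9,}]}'

try:
    orjson.loads(bad)                                       # Step 1: strict parse fails
except orjson.JSONDecodeError:
    data = orjson.loads(repair_json(bad).encode("utf-8"))   # Step 2 recovers it
    print(data["entities"][0]["text"])                      # -> AllyCAT
```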
    # STEP 5: Response Parsing and Validation
    def _parse_llm_extraction_response(self, llm_response: str, file_name: str) -> Dict[str, Any]:
        # Clean up the response first
        cleaned_response = llm_response.strip()

        # Remove markdown formatting
        if "```json" in cleaned_response:
            parts = cleaned_response.split("```json")
            if len(parts) > 1:
                json_part = parts[1].split("```")[0].strip()
                cleaned_response = json_part
        elif "```" in cleaned_response:
            parts = cleaned_response.split("```")
            if len(parts) >= 3:
                cleaned_response = parts[1].strip()

        # Run the multi-step JSON parsing pipeline
        try:
            extraction_data = self._smart_json_parse(cleaned_response)

            # Validate the complete format
            if self._validate_complete_format(extraction_data):
                return extraction_data
            else:
                self._save_failed_response(cleaned_response, file_name, "Format validation failed", "Missing required fields or empty values")
                return None
        except Exception as e:
            logger.error(f"❌ All JSON parsing steps failed for file {file_name}: {str(e)}")
            self._save_failed_response(cleaned_response, file_name, "All parsing steps failed", str(e))
            return None

    # STEP 6: Format Validation
    def _validate_complete_format(self, extraction_data: Dict[str, Any]) -> bool:
        if not isinstance(extraction_data, dict):
            return False

        if "entities" not in extraction_data or "relationships" not in extraction_data:
            return False

        entities = extraction_data.get("entities", [])
        relationships = extraction_data.get("relationships", [])
        if not isinstance(entities, list) or len(entities) == 0:
            return False
        for entity in entities:
            if not isinstance(entity, dict):
                return False

            required_fields = ["text", "type", "content", "confidence"]
            for field in required_fields:
                if field not in entity:
                    return False
                value = entity[field]
                if value is None or value == "" or (isinstance(value, str) and not value.strip()):
                    return False

            if not isinstance(entity["confidence"], (int, float)) or entity["confidence"] <= 0:
                return False

        if isinstance(relationships, list):
            for rel in relationships:
                if not isinstance(rel, dict):
                    return False

                required_fields = ["startNode", "endNode", "type", "description", "evidence", "confidence"]
                for field in required_fields:
                    if field not in rel:
                        return False
                    value = rel[field]
                    if value is None or value == "" or (isinstance(value, str) and not value.strip()):
                        return False

                if not isinstance(rel["confidence"], (int, float)) or rel["confidence"] <= 0:
                    return False

        return True

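The validator is deliberately strict: a single missing key, empty string, or non-positive confidence rejects the whole payload, which is what routes near-miss responses into `failed_responses.txt` instead of the graph. An illustration, assuming a constructed `GraphBuilder` instance named `builder`:

```python
good = {
    "entities": [{"text": "Neo4j", "type": "Technology",
                  "content": "Graph database used for storage", "confidence": 0.9}],
    "relationships": [],
}
bad = dict(good, entities=[{"text": "Neo4j", "type": "",  # empty type field
                            "content": "Graph database", "confidence": 0.9}])

assert builder._validate_complete_format(good) is True
assert builder._validate_complete_format(bad) is False
```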
    # STEP 7: Error Handling and Failed Response Logging
    def _save_failed_response(self, llm_response: str, file_name: str, _json_error: str, _repair_error: str):
        try:
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            output_dir = Path("workspace/graph_data")
            output_dir.mkdir(parents=True, exist_ok=True)

            with open(output_dir / "failed_responses.txt", 'a', encoding='utf-8') as f:
                f.write(f"# Failed response from file: {file_name} at {timestamp}\n")
                f.write(llm_response)
                f.write("\n---\n")
                f.flush()

        except Exception as save_error:
            logger.error(f"Failed to save failed response from {file_name}: {save_error}")

    # STEP 8: Main Entity Extraction
    def extract_entities_with_llm(self, content: str, file_name: str) -> Dict[str, Any]:
        # Preprocess content
        processed_content = self._preprocess_content(content)

        # Split into chunks
        chunks = self._chunk_content(processed_content)

        logger.info(f"📄 Processing {file_name}: {len(processed_content)} chars in {len(chunks)} chunk(s)")

        # Collect all entities and relationships from the chunks
        all_entities = []
        all_relationships = []

        for chunk_idx, chunk in enumerate(chunks):
            logger.info(f"🔄 Processing chunk {chunk_idx + 1}/{len(chunks)} for {file_name}")

            # Simple retry mechanism for invalid responses - just send to the LLM again
            max_retries = 3
            for attempt in range(max_retries):
                # Get the entity extraction prompt for the configured provider
                system_prompt = self.get_entity_extraction_prompt()

                # Create the user prompt with the chunk content
                chunk_info = f" (chunk {chunk_idx + 1}/{len(chunks)})" if len(chunks) > 1 else ""
                user_prompt = f"""
Analyze the following content from file "{file_name}"{chunk_info}:

```
{chunk}
```

Extract all relevant entities, concepts, and their relationships from this content.
"""

                # Call the appropriate LLM API
                try:
                    if self.llm_provider == "gemini":
                        llm_response = self._gemini_inference(system_prompt, user_prompt)
                    elif self.llm_provider == "cerebras":
                        llm_response = self._cerebras_inference(system_prompt, user_prompt)
                    else:
                        raise ValueError(f"Unsupported LLM provider: {self.llm_provider}")
                except Exception as e:
                    if "QUOTA_EXCEEDED" in str(e):
                        logger.error(f"🚫 QUOTA EXCEEDED on file {file_name}, chunk {chunk_idx + 1} - stopping processing")
                        # Return partial results if we have any
                        return {
                            "entities": all_entities,
                            "relationships": all_relationships,
                            "file": file_name,
                            "structure": {"section": "partial_quota_exceeded"},
                            "chunks_processed": chunk_idx,
                            "total_content_length": len(processed_content),
                            "quota_exceeded": True
                        }
                    else:
                        raise e

                # Parse the JSON response
                result = self._parse_llm_extraction_response(llm_response, f"{file_name}_chunk_{chunk_idx}")
                if result is not None or attempt == max_retries - 1:
                    if result is None:
                        logger.warning(f"❌ Chunk {chunk_idx + 1} of {file_name} failed all validation attempts, skipping")
                        break

                    # Add chunk results to the collections
                    chunk_entities = result.get("entities", [])
                    chunk_relationships = result.get("relationships", [])

                    # Add a chunk identifier to entities for deduplication
                    for entity in chunk_entities:
                        entity["chunk_id"] = chunk_idx
                        entity["source_chunk"] = f"chunk_{chunk_idx}"

                    # Add a chunk identifier to relationships
                    for rel in chunk_relationships:
                        rel["chunk_id"] = chunk_idx
                        rel["source_chunk"] = f"chunk_{chunk_idx}"

                    all_entities.extend(chunk_entities)
                    all_relationships.extend(chunk_relationships)

                    logger.info(f"✅ Chunk {chunk_idx + 1}: {len(chunk_entities)} entities, {len(chunk_relationships)} relationships")
                    break
                else:
                    logger.info(f"Chunk {chunk_idx + 1} attempt {attempt + 1}/{max_retries}: Validation failed, retrying")

        # Deduplicate entities across chunks (same entity name = same entity)
        unique_entities = {}
        for entity in all_entities:
            entity_key = entity.get("text", "").lower().strip()
            if entity_key and entity_key not in unique_entities:
                unique_entities[entity_key] = entity
            elif entity_key:
                # Merge information from duplicate entities
                existing = unique_entities[entity_key]
                existing["confidence"] = max(existing.get("confidence", 0), entity.get("confidence", 0))
                # Combine descriptions
                existing_desc = existing.get("content", "")
                new_desc = entity.get("content", "")
                if new_desc and new_desc not in existing_desc:
                    existing["content"] = f"{existing_desc}; {new_desc}".strip("; ")

        # Deduplicate relationships (same startNode+endNode+type = same relationship)
        unique_relationships = {}
        for rel in all_relationships:
            rel_key = f"{rel.get('startNode', '').lower()}||{rel.get('endNode', '').lower()}||{rel.get('type', '').lower()}"
            if rel_key and rel_key not in unique_relationships:
                unique_relationships[rel_key] = rel
            elif rel_key:
                # Keep the highest-confidence relationship
                existing = unique_relationships[rel_key]
                if rel.get("confidence", 0) > existing.get("confidence", 0):
                    unique_relationships[rel_key] = rel

        final_entities = list(unique_entities.values())
        final_relationships = list(unique_relationships.values())

        logger.info(f"Final results for {file_name}: {len(final_entities)} unique entities, {len(final_relationships)} unique relationships")

        return {
            "entities": final_entities,
            "relationships": final_relationships,
            "file": file_name,
            "structure": {"section": "full_analysis"},
            "chunks_processed": len(chunks),
            "total_content_length": len(processed_content)
        }

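Cross-chunk deduplication keys entities on their lowercased name (and relationships on the `startNode||endNode||type` triple), keeping the highest confidence and a merged description. The merge rule in isolation (values are made up):

```python
# Hypothetical duplicate of the same entity seen in two chunks.
seen = {"allycat": {"text": "AllyCAT", "content": "RAG toolkit", "confidence": 0.8}}
dup = {"text": "AllyCAT", "content": "Crawls websites", "confidence": 0.95}

existing = seen[dup["text"].lower().strip()]
existing["confidence"] = max(existing["confidence"], dup["confidence"])  # -> 0.95
if dup["content"] not in existing["content"]:
    existing["content"] = f'{existing["content"]}; {dup["content"]}'.strip("; ")
print(existing["content"])  # -> "RAG toolkit; Crawls websites"
```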
    # STEP 9: Single File Processing
    def process_md_file(self, md_file_path: str) -> Dict[str, Any]:
        logger.info(f"Processing: {md_file_path}")

        try:
            # Read the file content
            with open(md_file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            file_name = os.path.basename(md_file_path)

            # Extract entities and relationships using the LLM-only approach
            llm_data = self.extract_entities_with_llm(content, file_name)

            # Create nodes and relationships from the validated LLM data
            entities_added = 0
            relationships_added = 0

            # Check whether the quota was exceeded during extraction
            quota_exceeded = llm_data.get("quota_exceeded", False)
            if quota_exceeded:
                return {
                    "file": file_name,
                    "status": "quota_exceeded",
                    "entities_extracted": len(llm_data.get("entities", [])),
                    "unique_entities_added": 0,
                    "relationships_generated": 0,
                    "processed_at": datetime.now().isoformat(),
                    "error": "API quota exceeded during processing"
                }

            # Process entities from the LLM
            for entity in llm_data.get("entities", []):
                entity_text = entity["text"]
                semantic_key = entity_text.lower().strip()

                # Add to the global registry if new
                if semantic_key not in self.global_entity_registry:
                    # Use the LLM data directly
                    entity["id"] = str(uuid.uuid4())
                    entity["source_file"] = file_name

                    self.global_entity_registry[semantic_key] = entity
                    self.graph_data["nodes"].append(entity)
                    entities_added += 1

            # Process relationships from the LLM
            for rel in llm_data.get("relationships", []):
                # Apply confidence threshold filtering
                rel_confidence = rel.get("confidence", 0.0)
                if rel_confidence < self.min_confidence:
                    continue  # Skip low-confidence relationships

                start_text = rel["startNode"].lower().strip()
                end_text = rel["endNode"].lower().strip()

                # Only create the relationship if both entities exist
                if start_text in self.global_entity_registry and end_text in self.global_entity_registry:
                    # Use the original relationship type without sanitization
                    original_type = rel["type"]

                    # Create a clean relationship with only Neo4j fields
                    clean_rel = {
                        "id": str(uuid.uuid4()),
                        "startNode": self.global_entity_registry[start_text]["id"],
                        "endNode": self.global_entity_registry[end_text]["id"],
                        "type": original_type,  # Preserve the original semantic type
                        "description": rel.get("description", ""),
                        "evidence": rel.get("evidence", ""),
                        "confidence": rel_confidence,
                        "chunk_id": rel.get("chunk_id", 0),
                        "source_chunk": rel.get("source_chunk", ""),
                        "source_file": file_name
                    }

                    self.graph_data["relationships"].append(clean_rel)
                    relationships_added += 1

            result = {
                "file": file_name,
                "status": "success",
                "entities_extracted": len(llm_data.get("entities", [])),
                "unique_entities_added": entities_added,
                "relationships_generated": relationships_added,
                "processed_at": datetime.now().isoformat()
            }

            self.processed_files += 1
            logger.info(f"✅ Processed {file_name}: {entities_added} new entities, {relationships_added} relationships")
            return result

        except Exception as e:
            logger.error(f"❌ Error processing {md_file_path}: {e}")
            return {
                "file": os.path.basename(md_file_path),
                "status": "error",
                "error": str(e),
                "processed_at": datetime.now().isoformat()
            }

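Relationships are kept only when they clear `min_confidence` and both endpoints already exist in the entity registry, which keeps dangling edges out of Neo4j. A toy illustration (threshold and names are made up):

```python
min_confidence = 0.75  # illustrative; the real value comes from configuration
registry = {"allycat", "neo4j"}  # lowercased names already registered as nodes

candidates = [
    {"startNode": "AllyCAT", "endNode": "Neo4j", "type": "STORES_DATA_IN", "confidence": 0.9},
    {"startNode": "AllyCAT", "endNode": "Milvus", "type": "USES", "confidence": 0.9},    # endpoint missing
    {"startNode": "AllyCAT", "endNode": "Neo4j", "type": "MENTIONS", "confidence": 0.4},  # too weak
]

kept = [r for r in candidates
        if r["confidence"] >= min_confidence
        and r["startNode"].lower() in registry
        and r["endNode"].lower() in registry]
assert len(kept) == 1
```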
    # STEP 10: Batch File Processing
    def process_all_md_files(self, input_dir: str = None, output_path: str = None) -> Dict[str, Any]:
        if input_dir is None:
            input_dir = "workspace/processed"
        if output_path is None:
            output_path = os.path.join("workspace/graph_data", "graph-data-initial.json")

        # Clean the graph folder before starting fresh processing
        graph_dir = os.path.dirname(output_path)
        self.clean_graph_folder(graph_dir)

        input_path = Path(input_dir)
        md_files = list(input_path.glob("**/*.md"))  # Include subdirectories

        # Ensure the output directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        if not md_files:
            logger.warning(f"No markdown files found in {input_dir}")
            return {"status": "no_files", "message": "No markdown files found"}

        logger.info(f"Found {len(md_files)} markdown files to process")

        # Reset data structures for a clean batch run
        self.graph_data = {"nodes": [], "relationships": []}
        self.global_entity_registry = {}  # Reset the global registry
        self.processed_files = 0

        logger.info(f"🚀 Starting document processing with Neo4j format output ({self.llm_provider.upper()})...")

        # Process files with progress tracking
        results = []
        processed_successfully = []
        failed_files = []
        quota_exceeded_files = []
        start_time = time.time()

        for i, md_file in enumerate(md_files, 1):
            file_start_time = time.time()
            logger.info(f"Processing file {i}/{len(md_files)}: {md_file.name}")

            # Track registry size before processing
            initial_registry_size = len(self.global_entity_registry)
            initial_relationship_count = len(self.graph_data["relationships"])

            # Process the file
            result = self.process_md_file(str(md_file))
            results.append(result)

            # Track file status for detailed logging
            file_status = result.get("status", "unknown")
            if file_status == "success":
                processed_successfully.append(md_file.name)
            elif file_status == "quota_exceeded":
                quota_exceeded_files.append(md_file.name)
                logger.warning(f"🚫 QUOTA EXCEEDED - Stopping batch processing at file {i}/{len(md_files)}")
                break  # Stop processing when the quota is exceeded
            else:
                failed_files.append((md_file.name, result.get("error", "Unknown error")))

            # Calculate processing metrics
            file_time = time.time() - file_start_time
            new_entities = len(self.global_entity_registry) - initial_registry_size
            new_relationships = len(self.graph_data["relationships"]) - initial_relationship_count

            # Show detailed progress information
            logger.info(f"  File processed in {file_time:.2f}s: {new_entities} new entities, {new_relationships} relationships")

            # Show batch progress at regular intervals
            if i % 5 == 0 or i == len(md_files):
                successful_so_far = sum(1 for r in results if r.get("status") == "success")
                elapsed = time.time() - start_time
                avg_time = elapsed / i
                remaining = avg_time * (len(md_files) - i)

                logger.info(f"Progress: {i}/{len(md_files)} files ({successful_so_far} successful)")
                logger.info(f"  Current stats: {len(self.global_entity_registry)} unique entities, {len(self.graph_data['relationships'])} relationships")
                logger.info(f"Time elapsed: {elapsed:.1f}s (avg {avg_time:.1f}s per file, ~{remaining:.1f}s remaining)")

        # Generate a comprehensive summary with detailed tracking
        elapsed = time.time() - start_time
        successful = len(processed_successfully)
        quota_exceeded = len(quota_exceeded_files)
        failed = len(failed_files)
        unique_entities = len(self.global_entity_registry)

        # Save detailed processing lists
        self._save_processing_logs(processed_successfully, quota_exceeded_files, failed_files, output_path)

        # Count entity types
        entity_types = {}
        for entity_info in self.global_entity_registry.values():
            entity_type = entity_info["type"]
            entity_types[entity_type] = entity_types.get(entity_type, 0) + 1

        # Count relationship types
        relationship_types = {}
        for rel in self.graph_data["relationships"]:
            rel_type = rel["type"]
            relationship_types[rel_type] = relationship_types.get(rel_type, 0) + 1

        summary = {
            "status": "completed",
            "total_files": len(md_files),
            "successful": successful,
            "quota_exceeded": quota_exceeded,
            "failed": failed,
            "unique_entities": unique_entities,
            "total_relationships": len(self.graph_data["relationships"]),
            "entity_types": entity_types,
            "relationship_types": relationship_types,
            "processing_time_seconds": elapsed,
            "average_time_per_file": elapsed / len(md_files) if md_files else 0,
            "model": self.model_name,
            "llm_provider": self.llm_provider,
            "processed_at": datetime.now().isoformat()
        }

        logger.info(f"✅ Processing complete in {elapsed:.1f}s: {successful}/{len(md_files)} files successful")
        if quota_exceeded > 0:
            logger.warning(f"🚫 {quota_exceeded} files hit the quota limit")
        if failed > 0:
            logger.error(f"❌ {failed} files failed with errors")
        logger.info(f"Final stats: {unique_entities} unique entities, {len(self.graph_data['relationships'])} relationships")

        # Log the entity and relationship type breakdown
        logger.info("Entity types:")
        for entity_type, count in sorted(entity_types.items(), key=lambda x: x[1], reverse=True)[:10]:
            logger.info(f"  - {entity_type}: {count}")

        logger.info("Relationship types:")
        for rel_type, count in sorted(relationship_types.items(), key=lambda x: x[1], reverse=True)[:10]:
            logger.info(f"  - {rel_type}: {count}")

        return summary

    # STEP 10.5: Processing Logs Tracking
    def _save_processing_logs(self, successful_files: List[str], quota_exceeded_files: List[str], failed_files: List[tuple], output_path: str):
        try:
            output_dir = Path(output_path).parent

            # Save successfully processed files
            with open(output_dir / "processed_successfully.txt", 'w', encoding='utf-8') as f:
                f.write(f"# Successfully Processed Files ({len(successful_files)} total)\n")
                f.write(f"# Generated: {datetime.now().isoformat()}\n\n")
                for file_name in successful_files:
                    f.write(f"{file_name}\n")

            # Save quota-exceeded files
            if quota_exceeded_files:
                with open(output_dir / "quota_exceeded_files.txt", 'w', encoding='utf-8') as f:
                    f.write(f"# Files That Hit Quota Limit ({len(quota_exceeded_files)} total)\n")
                    f.write(f"# Generated: {datetime.now().isoformat()}\n\n")
                    for file_name in quota_exceeded_files:
                        f.write(f"{file_name}\n")

            # Save failed files with their errors
            if failed_files:
                with open(output_dir / "failed_files.txt", 'w', encoding='utf-8') as f:
                    f.write(f"# Files That Failed Processing ({len(failed_files)} total)\n")
                    f.write(f"# Generated: {datetime.now().isoformat()}\n\n")
                    for file_name, error in failed_files:
                        f.write(f"{file_name}: {error}\n")

            logger.info(f"📋 Processing logs saved to {output_dir}")

        except Exception as e:
            logger.error(f"❌ Failed to save processing logs: {e}")

    # STEP 11: Graph Data Output
    def save_graph_data(self, output_path: str = None) -> bool:
        if output_path is None:
            output_path = os.path.join("workspace/graph_data", "graph-data-initial.json")
        try:
            # Ensure the output directory exists
            output_dir = Path(output_path).parent
            output_dir.mkdir(parents=True, exist_ok=True)

            # Compile the final data from the global entity registry
            final_nodes = []

            for semantic_key, entity_info in self.global_entity_registry.items():
                entity_id = entity_info["id"]

                # Create a Neo4j node
                node = {
                    "id": entity_id,
                    "elementId": entity_id,
                    "labels": [entity_info["type"]],
                    "properties": {
                        "name": entity_info["text"],
                        "content": entity_info.get("content", ""),
                        "source": entity_info.get("source_file", ""),
                        "confidence": entity_info["confidence"],
                        "created_date": datetime.now().strftime("%Y-%m-%d"),
                        "extraction_method": self.llm_provider
                    }
                }
                final_nodes.append(node)

            # Use the accumulated relationships
            final_relationships = self.graph_data["relationships"]

            # Prepare the final graph data
            final_graph = {
                "nodes": final_nodes,
                "relationships": final_relationships,
                "metadata": {
                    "node_count": len(final_nodes),
                    "relationship_count": len(final_relationships),
                    "generated_at": datetime.now().isoformat(),
                    "generator": "Allycat GraphBuilder",
                    "llm_provider": self.llm_provider,
                    "model": self.model_name,
                    "format_version": "neo4j-2025"
                }
            }

            # Save the final graph data
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(final_graph, f, indent=2, ensure_ascii=False)

            # Calculate the final output size
            output_size = os.path.getsize(output_path)
            output_size_mb = output_size / (1024 * 1024)

            logger.info(f"✅ Neo4j graph data saved to {output_path} ({output_size_mb:.2f} MB)")
            logger.info(f"Final stats: {len(final_nodes)} nodes, {len(final_relationships)} relationships")
            return True

        except Exception as e:
            logger.error(f"❌ Error saving graph data: {e}")
            return False

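The output is plain JSON in a Neo4j-flavored node/relationship shape, so later phases and quick sanity checks can read it with the standard library. A small check, assuming Phase 1 has already written the file:

```python
import json

with open("workspace/graph_data/graph-data-initial.json", encoding="utf-8") as f:
    graph = json.load(f)

meta = graph["metadata"]
print(meta["node_count"], meta["relationship_count"], meta["format_version"])

# The writer only emits edges whose endpoints exist, so this should be empty.
node_ids = {n["id"] for n in graph["nodes"]}
dangling = [r for r in graph["relationships"]
            if r["startNode"] not in node_ids or r["endNode"] not in node_ids]
assert not dangling
```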
# STEP 12: Main Entry Point
def main():
    """Main function to run the content analysis pipeline."""
    logger.info("Starting Content Analysis Pipeline (Cloud-based APIs)")

    # Choose the LLM provider from the environment, defaulting to Cerebras
    llm_provider = os.getenv("GRAPH_LLM_PROVIDER", "cerebras").lower()
    logger.info(f"Using LLM provider: {llm_provider.upper()}")

    # Validate the provider choice
    valid_providers = ["cerebras", "gemini"]
    if llm_provider not in valid_providers:
        logger.warning(f"⚠️ Invalid provider '{llm_provider}'. Using 'cerebras' (default)")
        llm_provider = "cerebras"

    try:
        analyzer = GraphBuilder(llm_provider=llm_provider)

        # Normal processing
        summary = analyzer.process_all_md_files()

        if summary["status"] == "no_files":
            logger.warning("⚠️ No files to process")
            return 1

        if analyzer.save_graph_data():
            logger.info("✅ Content Analysis completed successfully!")
            logger.info(f"Results: {summary['successful']}/{summary['total_files']} files processed")
            logger.info(f"Graph: {summary['unique_entities']} nodes, {summary['total_relationships']} relationships")
            logger.info(f"Model used: {analyzer.model_name} via {llm_provider.upper()}")
            return 0
        else:
            logger.error("❌ Failed to save graph data")
            return 1

    except Exception as e:
        logger.error(f"❌ Pipeline failed: {e}")
        return 1


if __name__ == "__main__":
    exit(main())
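In the Docker pipeline this script runs automatically, but it can also be driven standalone; `main()` reads the provider from `GRAPH_LLM_PROVIDER`. A sketch (API credentials for the chosen provider are assumed to be configured separately):

```python
# Run Phase 1 on its own. From the shell:
#   GRAPH_LLM_PROVIDER=gemini python 2b_process_graph_phase1.py
# Or drive it from Python:
import os

os.environ["GRAPH_LLM_PROVIDER"] = "cerebras"  # or "gemini"
builder = GraphBuilder(llm_provider=os.environ["GRAPH_LLM_PROVIDER"])
summary = builder.process_all_md_files()       # reads workspace/processed/**/*.md
if summary["status"] != "no_files":
    builder.save_graph_data()                  # writes graph-data-initial.json
```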
2b_process_graph_phase2.py
ADDED
@@ -0,0 +1,427 @@
"""
Phase 2: Community Detection using Leiden Algorithm
Loads graph-data-initial.json, runs community detection, saves graph-data-phase-2.json
"""

import json
import logging
import os
import time
from pathlib import Path
from typing import Dict, Any
from collections import defaultdict

import networkx as nx
import igraph as ig
import leidenalg
import traceback

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class GraphBuilderPhase2:
    """Phase 2: Detect communities using graph algorithms (NetworkX + Leiden)"""

    def __init__(self):
        """Initialize the Phase 2 processor"""
        self.graph_data = None
        self.nx_graph = None
        self.community_result = None
        self.community_stats = None
        self.centrality_metrics = None

        # Configuration from environment or defaults
        self.min_community_size = int(os.getenv("GRAPH_MIN_COMMUNITY_SIZE", "5"))
        self.leiden_resolution = float(os.getenv("GRAPH_LEIDEN_RESOLUTION", "1.0"))
        self.leiden_iterations = int(os.getenv("GRAPH_LEIDEN_ITERATIONS", "-1"))  # -1 = run until convergence
        self.leiden_seed = int(os.getenv("GRAPH_LEIDEN_SEED", "42"))

        logger.info("✅ Phase 2 Initialized: Community Detection")
        logger.info(f"  - Min Community Size: {self.min_community_size}")
        logger.info(f"  - Leiden Resolution: {self.leiden_resolution}")

    # STEP 1: Load Graph Data from Phase 1
    def load_graph_data(self, input_path: str = None) -> bool:
        """Load graph data from the specified JSON file."""
        if input_path is None:
            input_path = "workspace/graph_data/graph-data-initial.json"

        logger.info(f"Loading graph data from {input_path}...")

        try:
            input_file = Path(input_path)
            if not input_file.exists():
                logger.error(f"❌ Input file not found: {input_path}")
                logger.warning("Please run Phase 1 (2b_process_graph_phase1.py) to generate the graph data.")
                return False

            with open(input_file, 'r', encoding='utf-8') as f:
                self.graph_data = json.load(f)

            node_count = len(self.graph_data.get("nodes", []))
            rel_count = len(self.graph_data.get("relationships", []))

            logger.info(f"  - Found {node_count} nodes and {rel_count} relationships")

            if node_count == 0:
                logger.error("❌ Graph data is empty. Cannot proceed.")
                return False

            return True

        except Exception as e:
            logger.error(f"❌ Error loading graph data: {e}")
            return False

    # STEP 2: Build NetworkX Graph
    def _build_networkx_graph(self) -> nx.Graph:
        """Convert graph_data JSON to a NetworkX graph for analysis"""
        logger.info("Building NetworkX graph from JSON data...")

        G = nx.Graph()

        # Add nodes with attributes
        for node in self.graph_data["nodes"]:
            node_id = node["id"]
            properties = node.get("properties", {})

            G.add_node(
                node_id,
                name=properties.get("name", ""),
                type=node.get("labels", ["Unknown"])[0],
                description=properties.get("content", ""),
                source=properties.get("source", ""),
                confidence=properties.get("confidence", 0.0)
            )

        # Add edges with attributes
        for rel in self.graph_data["relationships"]:
            start_node = rel.get("startNode")
            end_node = rel.get("endNode")

            # Only add the edge if both nodes exist
            if start_node in G.nodes() and end_node in G.nodes():
                G.add_edge(
                    start_node,
                    end_node,
                    type=rel.get("type", "RELATED_TO"),
                    evidence=rel.get("evidence", ""),
                    confidence=rel.get("confidence", 0.0)
                )

        logger.info(f"✅ Built NetworkX graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

        # Log basic graph statistics
        if G.number_of_nodes() > 0:
            density = nx.density(G)
            logger.info(f"📊 Graph density: {density:.4f}")

            if G.number_of_edges() > 0:
                avg_degree = sum(dict(G.degree()).values()) / G.number_of_nodes()
                logger.info(f"📊 Average degree: {avg_degree:.2f}")

        return G

    # STEP 3: Convert to igraph for Leiden
    def _convert_to_igraph(self, G: nx.Graph) -> ig.Graph:
        """Convert the NetworkX graph to igraph for the Leiden algorithm"""
        logger.info("🔄 Converting to igraph format for Leiden algorithm...")

        # Create a mapping from node IDs to contiguous indices
        node_list = list(G.nodes())
        node_to_idx = {node: idx for idx, node in enumerate(node_list)}

        # Create the edge list with indices
        edges = [(node_to_idx[u], node_to_idx[v]) for u, v in G.edges()]

        # Create the igraph graph
        ig_graph = ig.Graph(n=len(node_list), edges=edges, directed=False)

        # Add node attributes
        ig_graph.vs["name"] = [G.nodes[node].get("name", "") for node in node_list]
        ig_graph.vs["node_id"] = node_list

        logger.info(f"✅ Converted to igraph: {ig_graph.vcount()} vertices, {ig_graph.ecount()} edges")

        return ig_graph

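leidenalg operates on igraph objects with contiguous integer vertex ids, so the conversion re-indexes the nodes and stashes the original ids in a `node_id` vertex attribute for mapping results back. The same convention on a toy graph:

```python
import networkx as nx
import igraph as ig

G = nx.Graph([("a", "b"), ("b", "c")])
nodes = list(G.nodes())
idx = {n: i for i, n in enumerate(nodes)}

g = ig.Graph(n=len(nodes), edges=[(idx[u], idx[v]) for u, v in G.edges()])
g.vs["node_id"] = nodes

# Vertex 0 maps back to the original NetworkX node id.
assert g.vs[0]["node_id"] == "a"
```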
    # STEP 4: Run Leiden Algorithm
    def _run_leiden_algorithm(self, ig_graph: ig.Graph) -> Dict[str, Any]:
        """Run the Leiden algorithm for community detection"""
        logger.info("🔍 Running Leiden community detection algorithm...")
        logger.info(f"Parameters: resolution={self.leiden_resolution}, iterations={self.leiden_iterations}, seed={self.leiden_seed}")

        start_time = time.time()

        try:
            # Run the Leiden algorithm.
            # NOTE: ModularityVertexPartition optimizes plain modularity and takes no
            # resolution parameter, so self.leiden_resolution is logged above but not
            # applied here (RBConfigurationVertexPartition would accept one).
            partition = leidenalg.find_partition(
                ig_graph,
                leidenalg.ModularityVertexPartition,
                n_iterations=self.leiden_iterations,
                seed=self.leiden_seed
            )

            # Extract community assignments
            community_assignments = {}
            for idx, community_id in enumerate(partition.membership):
                node_id = ig_graph.vs[idx]["node_id"]
                community_assignments[node_id] = community_id

            # Calculate statistics
            num_communities = len(set(partition.membership))
            modularity = partition.modularity

            elapsed = time.time() - start_time

            logger.info(f"✅ Leiden algorithm completed in {elapsed:.2f}s")
            logger.info(f"Detected {num_communities} communities")
            logger.info(f"Modularity score: {modularity:.4f}")

            return {
                "assignments": community_assignments,
                "num_communities": num_communities,
                "modularity": modularity,
                "algorithm": "Leiden",
                "execution_time": elapsed
            }

        except Exception as e:
            logger.error(f"❌ Leiden algorithm failed: {e}")
            raise e

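For reference, `find_partition` returns a partition whose `.membership` list is indexed by igraph vertex id, which is why the conversion step stored `node_id` on each vertex. A toy run on two triangles joined by a bridge:

```python
import igraph as ig
import leidenalg

# Two obvious triangles joined by one bridge edge.
g = ig.Graph(edges=[(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5), (2, 3)])

part = leidenalg.find_partition(g, leidenalg.ModularityVertexPartition, seed=42)
print(part.membership)              # e.g. [0, 0, 0, 1, 1, 1]
print(round(part.modularity, 3))    # modularity of the detected split
```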
    # STEP 5: Calculate Community Statistics
    def _calculate_community_stats(self, G: nx.Graph, community_assignments: Dict[str, int]) -> Dict[int, Dict]:
        """Calculate statistics for each community"""
        logger.info("Calculating community statistics...")

        # Group nodes by community
        communities = defaultdict(list)
        for node_id, comm_id in community_assignments.items():
            communities[comm_id].append(node_id)

        # Calculate stats for each community
        stats = {}
        for comm_id, node_ids in communities.items():
            # Skip very small communities if configured
            if len(node_ids) < self.min_community_size:
                logger.debug(f"Skipping small community {comm_id} with {len(node_ids)} members")
                continue

            subgraph = G.subgraph(node_ids)

            stats[comm_id] = {
                "member_count": len(node_ids),
                "internal_edges": subgraph.number_of_edges(),
                "density": nx.density(subgraph) if len(node_ids) > 1 else 0.0,
                "avg_degree": sum(dict(subgraph.degree()).values()) / len(node_ids) if len(node_ids) > 0 else 0.0,
                "member_ids": node_ids[:20]  # Store the top 20 for summary generation
            }

        logger.info(f"Calculated statistics for {len(stats)} communities (filtered by min_size={self.min_community_size})")

        # Log the top 5 largest communities
        sorted_communities = sorted(stats.items(), key=lambda x: x[1]["member_count"], reverse=True)
        logger.info("Top 5 largest communities:")
        for comm_id, stat in sorted_communities[:5]:
            logger.info(f"  Community {comm_id}: {stat['member_count']} members, {stat['internal_edges']} edges, density={stat['density']:.3f}")

        return stats

    # STEP 6: Calculate Centrality Metrics
    def _calculate_centrality_metrics(self, G: nx.Graph) -> Dict[str, Dict]:
        """Calculate centrality metrics for all nodes"""
        logger.info("Calculating node centrality metrics...")

        start_time = time.time()

        # Degree centrality (fast, always calculated)
        degree_centrality = nx.degree_centrality(G)

        # Betweenness centrality (expensive, only for smaller graphs)
        if G.number_of_nodes() < 5000:
            logger.info("  Calculating betweenness centrality...")
            betweenness_centrality = nx.betweenness_centrality(G, k=min(100, G.number_of_nodes()))
        else:
            logger.info("  Skipping betweenness centrality (graph too large)")
            betweenness_centrality = {node: 0.0 for node in G.nodes()}

        # Closeness centrality (expensive, only for smaller graphs)
        if G.number_of_nodes() < 5000:
            logger.info("  Calculating closeness centrality...")
            closeness_centrality = nx.closeness_centrality(G)
        else:
            logger.info("  Skipping closeness centrality (graph too large)")
            closeness_centrality = {node: 0.0 for node in G.nodes()}

        # Combine the metrics
        centrality_metrics = {}
        for node in G.nodes():
            centrality_metrics[node] = {
                "degree": G.degree(node),
                "degree_centrality": degree_centrality.get(node, 0.0),
                "betweenness_centrality": betweenness_centrality.get(node, 0.0),
                "closeness_centrality": closeness_centrality.get(node, 0.0)
            }

        elapsed = time.time() - start_time
        logger.info(f"✅ Calculated centrality for {len(centrality_metrics)} nodes in {elapsed:.2f}s")

        return centrality_metrics

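Exact betweenness centrality is expensive (roughly O(V·E)), so the method samples at most 100 source nodes via NetworkX's `k` parameter and skips the slow metrics entirely past 5,000 nodes. The trade-off in isolation (graph size is illustrative):

```python
import networkx as nx

G = nx.erdos_renyi_graph(1000, 0.01, seed=42)

exact = nx.betweenness_centrality(G)                    # all 1000 sources
approx = nx.betweenness_centrality(G, k=100, seed=42)   # 100 sampled sources

# Sampled scores track the exact ones at a fraction of the cost.
top = max(exact, key=exact.get)
print(exact[top], approx[top])
```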
    # STEP 7: Add Community Data to Nodes
    def _add_community_data_to_nodes(self, community_assignments: Dict[str, int], centrality_metrics: Dict[str, Dict]) -> None:
        """Add community_id and centrality metrics to node properties"""
        logger.info("Adding community assignments and centrality to nodes...")

        nodes_updated = 0

        for node in self.graph_data["nodes"]:
            node_id = node["id"]

            # Add community_id
            if node_id in community_assignments:
                node["properties"]["community_id"] = f"comm-{community_assignments[node_id]}"
                nodes_updated += 1

            # Add centrality metrics
            if node_id in centrality_metrics:
                metrics = centrality_metrics[node_id]
                node["properties"]["degree"] = metrics["degree"]
                node["properties"]["degree_centrality"] = round(metrics["degree_centrality"], 4)
                node["properties"]["betweenness_centrality"] = round(metrics["betweenness_centrality"], 4)
                node["properties"]["closeness_centrality"] = round(metrics["closeness_centrality"], 4)

        logger.info(f"✅ Updated {nodes_updated} nodes with community and centrality data")

    # STEP 8: Main Processing Entry Point
    def run_community_detection(self, input_path: str = None, output_path: str = None) -> bool:
        """Main entry point for Phase 2"""
        if output_path is None:
            output_path = "workspace/graph_data/graph-data-phase-2.json"

        logger.info("🚀 Starting Phase 2: Community Detection")
        logger.info("=" * 60)

        start_time = time.time()

        # Step 1: Load the Phase 1 output
        if not self.load_graph_data(input_path):
            return False

        # Step 2: Build the NetworkX graph
        self.nx_graph = self._build_networkx_graph()

        if self.nx_graph.number_of_nodes() == 0:
            logger.error("❌ Cannot run community detection on an empty graph")
            return False

        # Step 3: Convert to igraph
        ig_graph = self._convert_to_igraph(self.nx_graph)

        # Step 4: Run the Leiden algorithm
        self.community_result = self._run_leiden_algorithm(ig_graph)

        # Step 5: Calculate community statistics
        self.community_stats = self._calculate_community_stats(
            self.nx_graph,
            self.community_result["assignments"]
        )

        # Step 6: Calculate centrality metrics
        self.centrality_metrics = self._calculate_centrality_metrics(self.nx_graph)

        # Step 7: Add community data to nodes
        self._add_community_data_to_nodes(
            self.community_result["assignments"],
            self.centrality_metrics
        )

        # Step 8: Update metadata
        self.graph_data["metadata"]["phase"] = "community_detection"
        self.graph_data["metadata"]["community_detection"] = {
            "algorithm": "Leiden",
            "num_communities": self.community_result["num_communities"],
            "modularity_score": round(self.community_result["modularity"], 4),
            "execution_time_seconds": round(self.community_result["execution_time"], 2),
            "min_community_size": self.min_community_size,
            "resolution": self.leiden_resolution
        }

        # Step 9: Add community statistics to the output
        self.graph_data["community_stats"] = self.community_stats

        # Step 10: Save the Phase 2 output
        if self._save_phase2_output(output_path):
            elapsed = time.time() - start_time
            logger.info("=" * 60)
            logger.info(f"✅ Phase 2 completed successfully in {elapsed:.2f}s")
            logger.info("Final stats:")
            logger.info(f"  - Communities detected: {self.community_result['num_communities']}")
            logger.info(f"  - Modularity score: {self.community_result['modularity']:.4f}")
            logger.info(f"  - Nodes with community assignments: {len(self.community_result['assignments'])}")
            logger.info(f"  - Output saved to: {output_path}")
            return True
        else:
            return False

    # STEP 9: Save Phase 2 Output
    def _save_phase2_output(self, output_path: str) -> bool:
        """Save graph-data-phase-2.json"""
        try:
            # Ensure the output directory exists
            output_dir = Path(output_path).parent
            output_dir.mkdir(parents=True, exist_ok=True)

            # Save the Phase 2 output
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(self.graph_data, f, indent=2, ensure_ascii=False)

            # Calculate the file size
            output_size = os.path.getsize(output_path)
            output_size_mb = output_size / (1024 * 1024)

            logger.info(f"Saved Phase 2 output: {output_path} ({output_size_mb:.2f} MB)")

            return True

        except Exception as e:
            logger.error(f"❌ Error saving Phase 2 output: {e}")
            return False


# STEP 10: Main Entry Point
def main():
    """Main function to run Phase 2: Community Detection"""
    logger.info("🚀 GraphRAG Phase 2: Community Detection")
    logger.info("  Input: graph-data-initial.json (from Phase 1)")
    logger.info("  Output: graph-data-phase-2.json")
    logger.info("")

    try:
        # Initialize the Phase 2 processor
        processor = GraphBuilderPhase2()

        # Run community detection
        success = processor.run_community_detection()

        if success:
            logger.info("")
            logger.info("✅ Phase 2 completed successfully!")
            logger.info("Next step: Run Phase 3 (2b_process_graph_phase3.py) for community summarization")
            return 0
        else:
            logger.error("")
            logger.error("❌ Phase 2 failed")
            logger.error("  Please check the logs above for details")
            return 1

    except Exception as e:
        logger.error(f"❌ Phase 2 pipeline failed: {e}")
        logger.error(traceback.format_exc())
        return 1


if __name__ == "__main__":
    exit(main())
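The three graph phases communicate only through JSON files under `workspace/graph_data/`, so they can be chained with a minimal driver; a sketch using the script names as given:

```python
# Minimal driver sketch: each phase is a standalone script that reads the
# previous phase's JSON output and writes its own, so subprocess chaining suffices.
import subprocess
import sys

for script in ("2b_process_graph_phase1.py",
               "2b_process_graph_phase2.py",
               "2b_process_graph_phase3.py"):
    result = subprocess.run([sys.executable, script])
    if result.returncode != 0:
        sys.exit(f"{script} failed with exit code {result.returncode}")
```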
2b_process_graph_phase3.py
ADDED
@@ -0,0 +1,1096 @@
"""
Phase 3: Community Summarization using LLM
Loads graph-data-phase-2.json, generates summaries, saves graph-data-final.json
"""

import json
import logging
import os
import time
from pathlib import Path
from typing import Dict, Any, List, Optional
from datetime import datetime
from collections import defaultdict

import networkx as nx
import openai
import google.generativeai as genai

# JSON parsing libraries (same as Phase 1)
import orjson
from json_repair import repair_json

from my_config import MY_CONFIG

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class GraphBuilderPhase3:
    """Phase 3: Generate community summaries using LLM"""

    def __init__(self, llm_provider: str = "cerebras"):
        """Initialize Phase 3 processor"""
        self.llm_provider = llm_provider.lower()
        self.graph_data = None
        self.nx_graph = None
        self.community_assignments = {}
        self.community_stats = {}

        # Initialize LLM API based on provider
        if self.llm_provider == "cerebras":
            if not MY_CONFIG.CEREBRAS_API_KEY:
                raise ValueError("CEREBRAS_API_KEY not set")

            self.cerebras_client = openai.OpenAI(
                api_key=MY_CONFIG.CEREBRAS_API_KEY,
                base_url="https://api.cerebras.ai/v1"
            )
            self.model_name = "llama-4-scout-17b-16e-instruct"
            logger.info("🚀 Using Cerebras API")

        elif self.llm_provider == "gemini":
            if not MY_CONFIG.GEMINI_API_KEY:
                raise ValueError("GEMINI_API_KEY not set")

            genai.configure(api_key=MY_CONFIG.GEMINI_API_KEY)
            self.model_name = "gemini-1.5-flash"
            self.gemini_model = genai.GenerativeModel(self.model_name)
            logger.info("🆓 Using Google Gemini API")

        else:
            raise ValueError(f"Invalid provider '{llm_provider}'. Choose: cerebras, gemini")

        # Initialize embedding model for DRIFT search metadata
        try:
            from llama_index.embeddings.huggingface import HuggingFaceEmbedding
            self.embedding_model = HuggingFaceEmbedding(
                model_name=MY_CONFIG.EMBEDDING_MODEL
            )
            logger.info(f"🔍 Initialized embedding model: {MY_CONFIG.EMBEDDING_MODEL}")
        except Exception as e:
            logger.warning(f"⚠️ Embedding model initialization failed: {e}")
            self.embedding_model = None

        logger.info("✅ Phase 3 initialized: Community Summarization")
        logger.info(f"📊 LLM Provider: {self.llm_provider.upper()}, Model: {self.model_name}")
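
    # Usage sketch (illustrative, not part of the pipeline): both providers are
    # called through the same chat-style interface, so the rest of the class is
    # provider-agnostic. Assumes the matching API key is set in my_config.
    #   processor = GraphBuilderPhase3(llm_provider="gemini")
    #   processor.generate_summaries()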

    # STEP 1: Load Phase 2 Output
    def load_graph_data(self, input_path: str = None) -> bool:
        """Load graph-data-phase-2.json from Phase 2"""
        if input_path is None:
            input_path = "workspace/graph_data/graph-data-phase-2.json"

        try:
            input_file = Path(input_path)
            if not input_file.exists():
                logger.error(f"❌ Input file not found: {input_path}")
                logger.error("   Please run Phase 2 (2b_process_graph_phase2.py) first")
                return False

            with open(input_file, 'r', encoding='utf-8') as f:
                self.graph_data = json.load(f)

            node_count = len(self.graph_data.get("nodes", []))
            rel_count = len(self.graph_data.get("relationships", []))

            # Verify Phase 2 was completed
            if self.graph_data.get("metadata", {}).get("phase") != "community_detection":
                logger.error("❌ Input file is not from Phase 2 (community_detection)")
                return False

            logger.info(f"📂 Loaded graph-data-phase-2.json: {node_count} nodes, {rel_count} relationships")

            # Load community stats
            self.community_stats = self.graph_data.get("community_stats", {})
            num_communities = len(self.community_stats)
            logger.info(f"📊 Found {num_communities} communities to summarize")

            if num_communities == 0:
                logger.error("❌ No communities found in Phase 2 output")
                return False

            return True

        except Exception as e:
            logger.error(f"❌ Error loading graph data: {e}")
            return False
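
    # Minimal input shape the loader above checks for (a sketch inferred from
    # those checks, not a full schema):
    #   {
    #     "metadata": {"phase": "community_detection", ...},
    #     "nodes": [...],
    #     "relationships": [...],
    #     "community_stats": {"0": {"member_count": ..., "internal_edges": ...,
    #                               "density": ..., "avg_degree": ...}, ...}
    #   }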

    # STEP 2: Build NetworkX Graph
    def _build_networkx_graph(self) -> nx.Graph:
        """Rebuild NetworkX graph from JSON data"""
        logger.info("🔨 Building NetworkX graph from JSON data...")

        G = nx.Graph()

        # Add nodes with attributes
        for node in self.graph_data["nodes"]:
            node_id = node["id"]
            properties = node.get("properties", {})

            G.add_node(
                node_id,
                name=properties.get("name", ""),
                type=node.get("labels", ["Unknown"])[0],
                description=properties.get("content", ""),
                community_id=properties.get("community_id", ""),
                degree_centrality=properties.get("degree_centrality", 0.0)
            )

        # Add edges
        for rel in self.graph_data["relationships"]:
            start_node = rel.get("startNode")
            end_node = rel.get("endNode")

            if start_node in G.nodes() and end_node in G.nodes():
                G.add_edge(start_node, end_node)

        logger.info(f"✅ Built NetworkX graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

        return G
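
    # Note: nx.Graph() is undirected and collapses parallel edges, which is what
    # the community subgraphs and degree-centrality rankings below expect; the
    # original relationship records stay intact in self.graph_data.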

    # STEP 3: Extract Community Assignments
    def _extract_community_assignments(self) -> Dict[str, int]:
        """Extract community assignments from node properties"""
        logger.info("📋 Extracting community assignments from nodes...")

        assignments = {}

        for node in self.graph_data["nodes"]:
            node_id = node["id"]
            comm_id_str = node.get("properties", {}).get("community_id", "")

            if comm_id_str and comm_id_str.startswith("comm-"):
                try:
                    comm_id = int(comm_id_str.replace("comm-", ""))
                    assignments[node_id] = comm_id
                except ValueError:
                    logger.warning(f"Invalid community_id format: {comm_id_str}")

        logger.info(f"✅ Extracted {len(assignments)} community assignments")

        return assignments
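
    # Example of the mapping built above: a node whose properties carry
    # community_id == "comm-3" contributes {"<node-id>": 3}; nodes without the
    # "comm-" prefix are skipped with a warning.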

    # STEP 4: LLM Inference Methods
    def _cerebras_inference(self, system_prompt: str, user_prompt: str) -> str:
        """Call Cerebras API for inference"""
        try:
            # Calculate dynamic parameters based on community size and complexity
            # (guard against nx_graph still being None during early calls)
            total_nodes = self.nx_graph.number_of_nodes() if self.nx_graph is not None else 100
            complexity_factor = min(1.0, total_nodes / 1000)

            # Adaptive temperature: higher for complex graphs to encourage creativity
            dynamic_temperature = round(0.1 + (complexity_factor * 0.4), 2)  # Range: 0.1-0.5

            # Adaptive tokens: more for larger/complex summaries
            dynamic_tokens = int(300 + (complexity_factor * 400))  # Range: 300-700

            response = self.cerebras_client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=dynamic_temperature,
                max_tokens=dynamic_tokens
            )

            if not response or not response.choices or not response.choices[0].message.content:
                raise ValueError("Empty response from Cerebras")

            return response.choices[0].message.content.strip()

        except Exception as e:
            logger.error(f"Cerebras inference error: {e}")
            raise

    def _gemini_inference(self, system_prompt: str, user_prompt: str) -> str:
        """Call Gemini API for inference"""
        try:
            # Calculate dynamic generation config based on graph complexity
            total_nodes = self.nx_graph.number_of_nodes() if self.nx_graph is not None else 100
            complexity_factor = min(1.0, total_nodes / 1000)

            # Adaptive temperature and tokens for Gemini
            dynamic_temperature = round(0.1 + (complexity_factor * 0.4), 2)
            dynamic_tokens = int(300 + (complexity_factor * 400))

            generation_config = {
                "temperature": dynamic_temperature,
                "max_output_tokens": dynamic_tokens,
                "candidate_count": 1
            }

            combined_prompt = f"{system_prompt}\n\n{user_prompt}"
            response = self.gemini_model.generate_content(
                combined_prompt,
                generation_config=generation_config
            )

            if not response or not response.text:
                raise ValueError("Empty response from Gemini")

            return response.text.strip()

        except Exception as e:
            logger.error(f"Gemini inference error: {e}")
            raise
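
    # Worked example of the adaptive parameters (arithmetic from the code above):
    # a 500-node graph gives complexity_factor = 0.5, so temperature = 0.3 and
    # max tokens = 500; graphs of 1000+ nodes saturate at 0.5 and 700.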

    # STEP 5: Generate Community Summaries
    def _generate_community_summaries(self) -> Dict[int, str]:
        """Generate LLM summaries for each community"""
        logger.info("📝 Generating community summaries with LLM...")
        logger.info(f"   Total communities to summarize: {len(self.community_stats)}")

        summaries = {}

        # Group nodes by community
        communities = defaultdict(list)
        for node_id, comm_id in self.community_assignments.items():
            communities[comm_id].append(node_id)

        start_time = time.time()

        for idx, (comm_id_str, stats) in enumerate(self.community_stats.items(), 1):
            comm_id = int(comm_id_str)

            logger.info(f"   Processing community {idx}/{len(self.community_stats)}: comm-{comm_id} ({stats['member_count']} members)")

            # Get top entities by centrality
            node_ids = communities[comm_id]
            subgraph = self.nx_graph.subgraph(node_ids)

            # Get nodes sorted by degree centrality
            centrality = nx.degree_centrality(subgraph)
            top_nodes = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:15]

            # Prepare entity information for LLM
            entity_info = []
            for node_id, _ in top_nodes:
                node_data = self.nx_graph.nodes[node_id]
                entity_info.append({
                    "name": node_data.get("name", "Unknown"),
                    "type": node_data.get("type", "Unknown"),
                    "description": node_data.get("description", "")[:150]  # Limit length
                })

            # Create LLM prompt
            # Senior-developer style system/user prompts with strict output schema
            # Calculate dynamic topic count based on community size
            topic_count = max(2, min(5, stats['member_count'] // 3))  # Scale with community size

            system_prompt = (
                "You are a specialized knowledge graph summarization assistant. Your task is to analyze community "
                "structures and generate comprehensive summaries for graph-based retrieval systems.\n\n"
                "CONSTITUTIONAL AI PRINCIPLES:\n"
                "1. Content-Adaptive: Generate summaries based on actual community composition and statistics\n"
                "2. Context-Aware: Consider entity relationships and community density in summarization\n"
                "3. Quality-First: Prioritize accuracy and relevance over brevity\n"
                "4. Structured Output: Ensure consistent JSON format for programmatic consumption\n\n"
                "SUMMARIZATION GUIDELINES:\n"
                "- Analyze entity types, relationships, and community structure\n"
                "- Identify key themes and concepts that define this community\n"
                "- Generate topics that capture semantic meaning, not just entity names\n"
                "- Assess confidence based on data completeness and coherence\n"
                "- Use neutral, factual tone suitable for technical documentation"
            )

            user_prompt = (
                f"Analyze the following community data and generate a structured summary.\n\n"
                f"COMMUNITY STATISTICS:\n"
                f"- Total Members: {stats['member_count']}\n"
                f"- Internal Connections: {stats['internal_edges']}\n"
                f"- Community Density: {stats['density']:.3f}\n"
                f"- Connectivity Strength: {'High' if stats['density'] > 0.1 else 'Medium' if stats['density'] > 0.05 else 'Low'}\n\n"
                f"TOP ENTITIES (name, type, description):\n{json.dumps(entity_info, indent=2)}\n\n"
                f"OUTPUT FORMAT (strict JSON):\n"
                f"{{\n"
                f"  \"summary\": \"2-3 sentence comprehensive summary of community purpose and characteristics\",\n"
                f"  \"primary_topics\": [\"topic_1\", \"topic_2\", \"topic_{topic_count}\"],\n"
                f"  \"confidence\": 0.85\n"
                f"}}\n\n"
                f"VALIDATION REQUIREMENTS:\n"
                f"- summary: Must be 2-3 complete sentences describing community focus and key characteristics\n"
                f"- primary_topics: Array of exactly {topic_count} descriptive phrases (not just entity names)\n"
                f"- confidence: Float between 0.0-1.0 based on data quality and coherence\n\n"
                f"IMPORTANT: Respond with ONLY the JSON object. No markdown formatting, no explanations, no code blocks."
            )

            # Call LLM for summary
            try:
                if self.llm_provider == "gemini":
                    summary_response = self._gemini_inference(system_prompt, user_prompt)
                else:  # cerebras
                    summary_response = self._cerebras_inference(system_prompt, user_prompt)

                # Parse JSON response
                parsed_summary = self._parse_summary_response(summary_response, comm_id)
                if parsed_summary:
                    summaries[comm_id] = parsed_summary
                else:
                    # Fallback to raw response if parsing fails
                    summaries[comm_id] = summary_response.strip()

                # Log progress every 10 communities
                if idx % 10 == 0:
                    elapsed = time.time() - start_time
                    avg_time = elapsed / idx
                    remaining = avg_time * (len(self.community_stats) - idx)
                    logger.info(f"   Progress: {idx}/{len(self.community_stats)} ({elapsed:.1f}s elapsed, ~{remaining:.1f}s remaining)")

            except Exception as e:
                logger.error(f"❌ Failed to generate summary for community {comm_id}: {e}")
                summaries[comm_id] = f"Community with {stats['member_count']} entities focused on {entity_info[0]['type'] if entity_info else 'various'} topics."

        elapsed = time.time() - start_time
        logger.info(f"✅ Generated {len(summaries)} community summaries in {elapsed:.1f}s")

        return summaries
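
    # Illustrative well-formed reply the parser below expects (taken from the
    # prompt's own OUTPUT FORMAT; values are examples only):
    #   {"summary": "...", "primary_topics": ["...", "..."], "confidence": 0.85}
    # Only the "summary" field is consumed downstream; primary_topics and
    # confidence are requested for output discipline but currently unused.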

    def _parse_summary_response(self, response: str, comm_id: int) -> Optional[str]:
        """Parse JSON summary response with fallback to text extraction"""
        try:
            # Clean response
            cleaned_response = response.strip()

            # Remove markdown formatting
            if "```json" in cleaned_response:
                parts = cleaned_response.split("```json")
                if len(parts) > 1:
                    json_part = parts[1].split("```")[0].strip()
                    cleaned_response = json_part
            elif "```" in cleaned_response:
                parts = cleaned_response.split("```")
                if len(parts) >= 3:
                    cleaned_response = parts[1].strip()

            # Try to parse JSON
            try:
                summary_data = self._smart_json_parse_summary(cleaned_response)
                if summary_data and isinstance(summary_data, dict):
                    summary_text = summary_data.get('summary', '')
                    if summary_text and len(summary_text.strip()) > 10:
                        return summary_text.strip()
            except ValueError as e:
                logger.debug(f"Summary JSON parsing failed for comm-{comm_id}: {e}")
            except Exception as e:
                logger.debug(f"Summary JSON parsing unexpected error for comm-{comm_id}: {e}")

        except Exception as e:
            logger.debug(f"Summary JSON parsing failed for comm-{comm_id}: {e}")

        # Fallback: extract first meaningful sentence
        try:
            lines = response.split('\n')
            for line in lines:
                line = line.strip()
                if len(line) > 20 and '.' in line and not line.startswith('{'):
                    return line
        except Exception:
            pass

        return None
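
    # Example: a reply wrapped as ```json ... ``` has its fence stripped above
    # before parsing; a plain-prose reply falls through to the first-sentence
    # fallback, and None signals the caller to keep the raw response instead.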

    def _smart_json_parse_summary(self, json_text: str) -> Dict:
        """
        Simple multi-step JSON parsing approach (same as Phase 1):
        four parse attempts, then a final failure step
        """
        cleaned_text = json_text.strip()

        # Step 1: orjson
        try:
            result = orjson.loads(cleaned_text.encode('utf-8'))
            logger.debug("✅ Step 1: orjson succeeded")
            return result
        except Exception as e:
            logger.debug(f"❌ Step 1: orjson failed - {e}")

        # Step 2: json-repair
        try:
            repaired = repair_json(cleaned_text)
            result = orjson.loads(repaired.encode('utf-8'))
            logger.debug("✅ Step 2: json-repair + orjson succeeded")
            return result
        except Exception as e:
            logger.debug(f"❌ Step 2: json-repair failed - {e}")

        # Step 3: standard json
        try:
            result = json.loads(cleaned_text)
            logger.debug("✅ Step 3: standard json succeeded")
            return result
        except Exception as e:
            logger.debug(f"❌ Step 3: standard json failed - {e}")

        # Step 4: json-repair + standard json
        try:
            repaired = repair_json(cleaned_text)
            result = json.loads(repaired)
            logger.debug("✅ Step 4: json-repair + standard json succeeded")
            return result
        except Exception as e:
            logger.debug(f"❌ Step 4: json-repair + standard json failed - {e}")

        # Step 5: all parsers failed - this will trigger saving the failed txt files
        raise ValueError("All 4 JSON parsing steps failed")
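
    # Illustrative repair (assumes json_repair's usual behavior): a truncated
    # reply such as '{"summary": "Partial text' is closed into valid JSON at
    # Step 2 or 4, so minor LLM formatting slips rarely reach Step 5.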

    # STEP 6: Identify Key Entities
    def _identify_key_entities(self) -> Dict[int, List[str]]:
        """Identify key entities in each community based on centrality"""
        logger.info("🔑 Identifying key entities per community...")

        key_entities = {}

        # Group nodes by community
        communities = defaultdict(list)
        for node_id, comm_id in self.community_assignments.items():
            communities[comm_id].append(node_id)

        for comm_id, node_ids in communities.items():
            subgraph = self.nx_graph.subgraph(node_ids)

            # Calculate degree centrality
            centrality = nx.degree_centrality(subgraph)

            # Get top 5 entities
            top_nodes = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:5]

            key_entities[comm_id] = [
                self.nx_graph.nodes[node_id].get("name", "Unknown")
                for node_id, _ in top_nodes
            ]

        logger.info(f"✅ Identified key entities for {len(key_entities)} communities")

        return key_entities

    # STEP 7: Create Community Nodes
    def _create_community_nodes(self, community_summaries: Dict[int, str], key_entities: Dict[int, List[str]]) -> List[Dict]:
        """Create community nodes for the graph"""
        logger.info("🏗️ Creating community nodes...")

        import uuid

        community_nodes = []

        for comm_id_str, stats in self.community_stats.items():
            comm_id = int(comm_id_str)

            # Generate one UUID per community so "id" and "elementId" stay in sync
            node_uuid = f"community-{uuid.uuid4()}"
            node = {
                "id": node_uuid,
                "elementId": node_uuid,
                "labels": ["Community"],
                "properties": {
                    "community_id": f"comm-{comm_id}",
                    "level": 1,
                    "member_count": stats["member_count"],
                    "internal_edges": stats["internal_edges"],
                    "density": round(stats["density"], 4),
                    "avg_degree": round(stats["avg_degree"], 2),
                    "summary": community_summaries.get(comm_id, ""),
                    "key_entities": key_entities.get(comm_id, []),
                    "created_date": datetime.now().isoformat()
                }
            }
            community_nodes.append(node)

        logger.info(f"✅ Created {len(community_nodes)} community nodes")

        return community_nodes

    # STEP 8: Create IN_COMMUNITY Relationships
    def _create_in_community_relationships(self, community_nodes: List[Dict]) -> List[Dict]:
        """Create IN_COMMUNITY relationships linking entities to communities"""
        logger.info("Creating IN_COMMUNITY relationships...")

        import uuid

        # Create mapping from community_id to community node id
        comm_id_to_node_id = {}
        for node in community_nodes:
            comm_id = node["properties"]["community_id"]
            comm_id_to_node_id[comm_id] = node["id"]

        relationships = []

        for entity_id, comm_id in self.community_assignments.items():
            comm_node_id = comm_id_to_node_id.get(f"comm-{comm_id}")

            if comm_node_id:
                # Calculate confidence based on community membership strength
                entity_node = next((n for n in self.graph_data['nodes'] if n['id'] == entity_id), None)
                if entity_node:
                    degree_centrality = entity_node.get('properties', {}).get('degree_centrality', 0.5)
                    # Higher centrality = higher confidence in community assignment
                    dynamic_confidence = round(0.6 + (degree_centrality * 0.4), 3)  # Range: 0.6-1.0
                else:
                    dynamic_confidence = 0.8  # Default for missing nodes

                rel = {
                    "id": f"rel-{uuid.uuid4()}",
                    "startNode": entity_id,
                    "endNode": comm_node_id,
                    "type": "IN_COMMUNITY",
                    "properties": {
                        "confidence": dynamic_confidence,
                        "assigned_date": datetime.now().isoformat()
                    }
                }
                relationships.append(rel)

        logger.info(f"✅ Created {len(relationships)} IN_COMMUNITY relationships")

        return relationships
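
    # Worked example (arithmetic only): an entity with degree_centrality 0.75
    # gets confidence 0.6 + 0.75 * 0.4 = 0.9 on its IN_COMMUNITY edge; entities
    # missing from the node list fall back to the flat 0.8 default.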

    # STEP 9: DRIFT Search Metadata Generation
    def _generate_drift_metadata(self, community_summaries: Dict[int, str], key_entities: Dict[int, List[str]]) -> Dict:
        """Generate DRIFT search metadata using existing embedding infrastructure"""
        logger.info("🔍 Generating DRIFT search metadata...")

        if not self.embedding_model:
            logger.warning("⚠️ Embedding model not available, skipping DRIFT metadata")
            return {}

        # Calculate dynamic values from actual graph data
        total_communities = len(community_summaries)
        total_nodes = self.nx_graph.number_of_nodes()
        total_edges = self.nx_graph.number_of_edges()
        avg_community_size = sum(self.community_stats.get(str(i), {}).get("member_count", 0)
                                 for i in community_summaries.keys()) / total_communities if total_communities > 0 else 0
        graph_density = total_edges / (total_nodes * (total_nodes - 1) / 2) if total_nodes > 1 else 0

        # Calculate dynamic thresholds based on graph complexity
        complexity_factor = min(1.0, (total_nodes + total_edges) / 10000)  # Scale 0-1 based on graph size
        base_confidence = 0.6 + (complexity_factor * 0.3)  # Range: 0.6-0.9
        base_response_time = 1.0 + (complexity_factor * 3.0)  # Range: 1-4 seconds
        base_memory = int(20 + (avg_community_size * complexity_factor * 5))  # Scale with size

        # Adaptive configuration based on graph characteristics
        max_communities_for_primer = min(total_communities, max(2, total_communities // 4))
        lightweight_communities = max(1, max_communities_for_primer // 2)
        standard_communities = max(2, int(max_communities_for_primer // 1.5))
        comprehensive_communities = max_communities_for_primer

        # Calculate dynamic iteration counts based on community distribution
        max_iter = max(2, min(5, int(total_communities / 10) + 2))
        hyde_count = max(2, min(5, int(avg_community_size / 5) + 2))

        drift_metadata = {
            "version": "1.0",
            "generated_timestamp": datetime.now().isoformat(),
            "configuration": {
                "max_iterations": max_iter,
                "confidence_threshold": round(base_confidence + 0.1, 2),
                "top_k_communities": max_communities_for_primer,
                "hyde_expansion_count": hyde_count,
                "termination_criteria": "confidence_or_max_iterations"
            },
            "query_routing_config": {
                "lightweight_drift": {
                    "triggers": ["single_entity", "simple_fact", "definition_query"],
                    "config": {
                        "primer_communities": int(lightweight_communities),
                        "follow_up_iterations": max(1, max_iter - 2),
                        "confidence_threshold": round(base_confidence, 2)
                    }
                },
                "standard_drift": {
                    "triggers": ["multi_entity", "relationship_query", "how_does"],
                    "config": {
                        "primer_communities": int(standard_communities),
                        "follow_up_iterations": max(1, max_iter - 1),
                        "confidence_threshold": round(base_confidence + 0.1, 2)
                    }
                },
                "comprehensive_drift": {
                    "triggers": ["analyze", "compare", "implications", "strategy"],
                    "config": {
                        "primer_communities": int(comprehensive_communities),
                        "follow_up_iterations": max_iter,
                        "confidence_threshold": round(base_confidence + 0.2, 2)
                    }
                }
            },
            "performance_monitoring": {
                "response_time_targets": {
                    "p50": round(base_response_time * 1.0, 1),
                    "p95": round(base_response_time * 2.5, 1),
                    "p99": round(base_response_time * 5.0, 1)
                },
                "resource_tracking": {
                    "memory_per_query": base_memory,
                    "cache_hit_rate_target": round(0.5 + (complexity_factor * 0.3), 2)
                },
                "bottleneck_identification": ["community_ranking", "follow_up_generation", "embedding_computation"]
            },
            "community_search_index": {},
            "search_optimization": {
                "total_communities": total_communities,
                "avg_community_size": round(avg_community_size, 1),
                "graph_density": round(graph_density, 6),
                "total_nodes": total_nodes,
                "total_edges": total_edges,
                "max_primer_communities": max_communities_for_primer
            }
        }

        # Process each community
        for comm_id, summary in community_summaries.items():
            comm_key = f"comm-{comm_id}"

            try:
                # Generate embeddings using existing HuggingFace model
                summary_embedding = self.embedding_model.get_text_embedding(summary)
                hyde_embeddings = self._generate_hyde_embeddings(summary)
                follow_up_questions = self._generate_follow_up_questions(summary, comm_id, key_entities.get(comm_id, []))

                # Add to search index
                drift_metadata["community_search_index"][comm_key] = {
                    "summary": summary,
                    "key_entities": key_entities.get(comm_id, []),
                    "embeddings": {
                        "summary_embedding": summary_embedding,
                        "hyde_embeddings": hyde_embeddings
                    },
                    "follow_up_templates": follow_up_questions,
                    "statistics": self.community_stats.get(str(comm_id), {})
                }

            except Exception as e:
                logger.warning(f"⚠️ Failed to generate metadata for {comm_key}: {e}")
                continue

        logger.info(f"✅ Generated DRIFT metadata for {len(drift_metadata['community_search_index'])} communities")
        return drift_metadata
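
    # Worked example (arithmetic only): total_nodes=1500 and total_edges=3500
    # give complexity_factor = 0.5, base_confidence = 0.75, and a 2.5s p50
    # response-time target; with 40 communities, max_iter caps out at 5.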

    def _generate_hyde_embeddings(self, community_summary: str) -> List[List[float]]:
        """Generate HyDE embeddings for enhanced recall"""

        # Create 3 hypothetical document variations
        hyde_templates = [
            f"Research analysis and findings: {community_summary}",
            f"Technical report and documentation: {community_summary}",
            f"Business implications and strategic analysis: {community_summary}"
        ]

        hyde_embeddings = []
        for template in hyde_templates:
            try:
                embedding = self.embedding_model.get_text_embedding(template)
                hyde_embeddings.append(embedding)
            except Exception as e:
                logger.warning(f"⚠️ HyDE embedding generation failed: {e}")
                continue

        return hyde_embeddings
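
    # HyDE (Hypothetical Document Embeddings) embeds several framings an ideal
    # answer might take, so queries phrased as research, technical, or business
    # questions can all match this community's summary at retrieval time.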

    def _generate_follow_up_questions(self, community_summary: str, comm_id: int, key_entities: List[str]) -> List[Dict]:
        """Generate follow-up questions using existing LLM infrastructure"""

        # Professional system prompt matching Phase 1 style
        system_prompt = (
            "You are a specialized DRIFT search question generation assistant. Your task is to analyze community "
            "summaries and generate targeted follow-up questions for iterative knowledge graph exploration.\n\n"
            "CONSTITUTIONAL AI PRINCIPLES:\n"
            "1. Context-Adaptive: Generate questions based on actual community content and entities\n"
            "2. Search-Aware: Choose appropriate search types to guide query routing optimization\n"
            "3. Relevance-First: Prioritize questions that expand understanding of community themes\n"
            "4. Structured Output: Ensure consistent JSON format for programmatic consumption\n\n"
            "QUESTION GENERATION GUIDELINES:\n"
            "- Analyze community summary and key entities to identify knowledge gaps\n"
            "- Generate questions that would reveal additional relevant information\n"
            "- Use local search for entity-specific queries, relationship for connections, global for themes\n"
            "- Assign relevance scores based on potential value for understanding the community\n"
            "- Target entities should guide search focus and retrieval optimization"
        )

        user_prompt = (
            f"Analyze the following community data and generate targeted follow-up questions.\n\n"
            f"COMMUNITY SUMMARY:\n{community_summary}\n\n"
            f"KEY ENTITIES: {', '.join(key_entities[:5]) if key_entities else 'No specific entities identified'}\n\n"
            f"TASK: Generate exactly 3 strategic follow-up questions for DRIFT search.\n\n"
            f"OUTPUT FORMAT (strict JSON):\n"
            f"[\n"
            f"  {{\n"
            f"    \"question\": \"Specific, actionable question about the community\",\n"
            f"    \"relevance_score\": 0.85,\n"
            f"    \"search_type\": \"local\",\n"
            f"    \"target_entities\": [\"entity1\", \"entity2\"]\n"
            f"  }}\n"
            f"]\n\n"
            f"VALIDATION REQUIREMENTS:\n"
            f"- question: Must be a clear, specific question that expands community understanding\n"
            f"- relevance_score: Float 0.0-1.0 based on potential value for knowledge expansion\n"
            f"- search_type: Must be one of 'local', 'relationship', or 'global'\n"
            f"- target_entities: Array of relevant entity names from the key entities list\n\n"
            f"IMPORTANT: Respond with ONLY the JSON array. No markdown formatting, no explanations, no code blocks."
        )

        try:
            # Use existing LLM infrastructure
            if self.llm_provider == "cerebras":
                response = self._cerebras_inference(system_prompt, user_prompt)
            else:
                response = self._gemini_inference(system_prompt, user_prompt)

            # Parse LLM response to structured questions
            questions = self._parse_questions_response(response, key_entities)
            return questions

        except Exception as e:
            logger.error(f"❌ Question generation failed for comm-{comm_id}: {e}")
            return []

    def _parse_questions_response(self, response: str, key_entities: List[str]) -> List[Dict]:
        """Parse LLM response into structured questions using a robust multi-strategy approach"""
        try:
            # Calculate dynamic default relevance based on community statistics
            # (guard against nx_graph still being None during early calls)
            total_nodes = self.nx_graph.number_of_nodes() if self.nx_graph is not None else 100
            node_density = min(1.0, total_nodes / 500)  # Scale 0-1
            default_relevance = round(0.5 + (node_density * 0.4), 2)  # Range: 0.5-0.9
            max_questions = max(2, min(5, len(key_entities) + 1))  # Adaptive question count

            # Strategy 1: JSON array extraction with regex
            try:
                import re
                match = re.search(r"(\[\s*\{[\s\S]*?\}\s*\])", response)
                if match:
                    json_str = match.group(1)
                    try:
                        questions = self._smart_json_parse_questions(json_str)
                        if questions:
                            return self._validate_and_normalize_questions(questions, key_entities, default_relevance, max_questions)
                    except ValueError:
                        pass  # Continue to next strategy if JSON parsing fails
            except Exception:
                pass

            # Strategy 2: Multiple JSON objects extraction
            try:
                import re
                pattern = r'\{[^{}]*"question"[^{}]*\}'
                matches = re.findall(pattern, response)
                if matches:
                    json_array = "[" + ",".join(matches) + "]"
                    try:
                        questions = self._smart_json_parse_questions(json_array)
                        if questions:
                            return self._validate_and_normalize_questions(questions, key_entities, default_relevance, max_questions)
                    except ValueError:
                        pass  # Continue to next strategy if JSON parsing fails
            except Exception:
                pass

            # Strategy 3: Markdown list extraction
            try:
                questions = self._parse_markdown_questions(response, key_entities, default_relevance)
                if questions:
                    return self._validate_and_normalize_questions(questions, key_entities, default_relevance, max_questions)
            except Exception:
                pass

            # Strategy 4: Generate default questions based on entities
            return self._generate_default_questions(key_entities, default_relevance, max_questions)

        except Exception as e:
            logger.warning(f"⚠️ All question parsing strategies failed: {e}")
            return self._generate_default_questions(key_entities, 0.7, 3)
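
    # Worked example (arithmetic only): a 250-node graph gives node_density 0.5
    # and default_relevance 0.7; with 3 key entities, max_questions = 4.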

    def _smart_json_parse_questions(self, json_text: str) -> List[Dict]:
        """
        Simple multi-step JSON parsing approach (same as Phase 1):
        four parse attempts, then a final failure step
        """
        cleaned_text = json_text.strip()

        # Step 1: orjson
        try:
            result = orjson.loads(cleaned_text.encode('utf-8'))
            logger.debug("✅ Step 1: orjson succeeded")
            return result
        except Exception as e:
            logger.debug(f"❌ Step 1: orjson failed - {e}")

        # Step 2: json-repair
        try:
            repaired = repair_json(cleaned_text)
            result = orjson.loads(repaired.encode('utf-8'))
            logger.debug("✅ Step 2: json-repair + orjson succeeded")
            return result
        except Exception as e:
            logger.debug(f"❌ Step 2: json-repair failed - {e}")

        # Step 3: standard json
        try:
            result = json.loads(cleaned_text)
            logger.debug("✅ Step 3: standard json succeeded")
            return result
        except Exception as e:
            logger.debug(f"❌ Step 3: standard json failed - {e}")

        # Step 4: json-repair + standard json
        try:
            repaired = repair_json(cleaned_text)
            result = json.loads(repaired)
            logger.debug("✅ Step 4: json-repair + standard json succeeded")
            return result
        except Exception as e:
            logger.debug(f"❌ Step 4: json-repair + standard json failed - {e}")

        # Step 5: all parsers failed - this will trigger saving the failed txt files
        raise ValueError("All 4 JSON parsing steps failed")

    def _parse_markdown_questions(self, response: str, key_entities: List[str], default_relevance: float) -> List[Dict]:
        """Parse questions from markdown or plain text format"""
        questions = []

        # Look for numbered lists or bullet points
        import re
        patterns = [
            r'\d+\.\s*(.+?)(?=\n\d+\.|\n-|\n\*|$)',   # Numbered list
            r'-\s*(.+?)(?=\n-|\n\*|\n\d+\.|$)',       # Dash list
            r'\*\s*(.+?)(?=\n\*|\n-|\n\d+\.|$)'       # Asterisk list
        ]

        for pattern in patterns:
            matches = re.findall(pattern, response, re.MULTILINE | re.DOTALL)
            if matches and len(matches) >= 2:
                for i, match in enumerate(matches[:5]):  # Max 5 questions
                    question_text = match.strip().replace('\n', ' ')
                    if len(question_text) > 10:  # Reasonable question length
                        search_type = 'global' if any(word in question_text.lower()
                                                      for word in ['analyze', 'compare', 'overall', 'trends']) else 'local'
                        questions.append({
                            'question': question_text,
                            'relevance_score': max(0.6, default_relevance - (i * 0.1)),
                            'search_type': search_type,
                            'target_entities': key_entities[:2] if key_entities else []
                        })
                break

        return questions

    def _generate_default_questions(self, key_entities: List[str], default_relevance: float, max_questions: int) -> List[Dict]:
        """Generate default questions when parsing fails"""
        if not key_entities:
            return []

        # Template questions based on entity analysis
        question_templates = [
            ("What is {entity} and what role does it play?", "local"),
            ("How does {entity} relate to other entities in this community?", "relationship"),
            ("What are the key characteristics and properties of {entity}?", "local"),
            ("What trends or patterns involve {entity}?", "global"),
            ("How might {entity} impact the broader context?", "global")
        ]

        questions = []
        entities_to_use = key_entities[:max_questions]

        for i, entity in enumerate(entities_to_use):
            if i < len(question_templates):
                template, search_type = question_templates[i]
                question = template.format(entity=entity)
                questions.append({
                    'question': question,
                    'relevance_score': max(0.6, default_relevance - (i * 0.05)),
                    'search_type': search_type,
                    'target_entities': [entity]
                })

        return questions

    def _validate_and_normalize_questions(self, questions: List[Dict], key_entities: List[str],
                                          default_relevance: float, max_questions: int) -> List[Dict]:
        """Validate and normalize question format"""
        normalized = []

        for q in questions:
            if not isinstance(q, dict):
                continue

            # Extract question text
            question = q.get('question') or q.get('q') or q.get('text')
            if not question or len(str(question).strip()) < 5:
                continue

            # Extract and validate relevance score
            relevance = q.get('relevance_score', default_relevance)
            try:
                relevance = float(relevance)
                if relevance <= 0 or relevance > 1:
                    relevance = default_relevance
            except (ValueError, TypeError):
                relevance = default_relevance

            # Extract and validate search type
            search_type = q.get('search_type', 'local')
            if search_type not in ('local', 'relationship', 'global'):
                search_type = 'local'

            # Extract target entities
            target_entities = q.get('target_entities', [])
            if not isinstance(target_entities, list):
                target_entities = []

            # Ensure we have some target entities
            if not target_entities and key_entities:
                target_entities = key_entities[:2]

            normalized.append({
                'question': str(question).strip(),
                'relevance_score': round(relevance, 2),
                'search_type': search_type,
                'target_entities': target_entities
            })

            if len(normalized) >= max_questions:
                break

        return normalized
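
    # Example normalization: {"q": "Who funds X?", "relevance_score": "1.2"}
    # becomes {"question": "Who funds X?", "relevance_score": default_relevance,
    # "search_type": "local", "target_entities": <first two key entities>}.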

    # STEP 10: Main Processing Entry Point
    def generate_summaries(self, input_path: str = None, output_path: str = None) -> bool:
        """Main entry point for Phase 3"""
        if output_path is None:
            output_path = "workspace/graph_data/graph-data-final.json"

        logger.info("🚀 Starting Phase 3: Community Summarization")
        logger.info("=" * 60)

        start_time = time.time()

        # Step 1: Load Phase 2 output
        if not self.load_graph_data(input_path):
            return False

        # Step 2: Build NetworkX graph
        self.nx_graph = self._build_networkx_graph()

        # Step 3: Extract community assignments
        self.community_assignments = self._extract_community_assignments()

        # Step 4: Generate LLM summaries
        community_summaries = self._generate_community_summaries()

        # Step 5: Identify key entities
        key_entities = self._identify_key_entities()

        # Step 6: Create community nodes
        community_nodes = self._create_community_nodes(community_summaries, key_entities)

        # Step 7: Create IN_COMMUNITY relationships
        community_relationships = self._create_in_community_relationships(community_nodes)

        # Step 8: Merge everything
        self.graph_data["nodes"].extend(community_nodes)
        self.graph_data["relationships"].extend(community_relationships)

        # Step 9: Add communities section
        self.graph_data["communities"] = {
            "algorithm": "Leiden",
            "total_communities": len(community_summaries),
            "modularity_score": self.graph_data["metadata"]["community_detection"]["modularity_score"],
            "summaries": {
                f"comm-{k}": v for k, v in community_summaries.items()
            }
        }

        # Step 10: Generate DRIFT search metadata
        drift_metadata = self._generate_drift_metadata(community_summaries, key_entities)
        if drift_metadata:
            self.graph_data["drift_search_metadata"] = drift_metadata
            logger.info("✅ Added DRIFT search metadata to graph data")

        # Step 11: Clean up temporary data
        if "community_stats" in self.graph_data:
            del self.graph_data["community_stats"]

        # Step 12: Update metadata
        self.graph_data["metadata"]["phase"] = "final"
        self.graph_data["metadata"]["entity_count"] = len([n for n in self.graph_data["nodes"] if "Community" not in n["labels"]])
        self.graph_data["metadata"]["community_count"] = len(community_nodes)
        self.graph_data["metadata"]["total_node_count"] = len(self.graph_data["nodes"])
        self.graph_data["metadata"]["total_relationship_count"] = len(self.graph_data["relationships"])

        # Step 13: Save final output
        if self._save_final_output(output_path):
            elapsed = time.time() - start_time
            logger.info("=" * 60)
            logger.info(f"✅ Phase 3 completed successfully in {elapsed:.1f}s")
            logger.info("📊 Final stats:")
            logger.info(f"   - Total nodes: {len(self.graph_data['nodes'])}")
            logger.info(f"   - Entity nodes: {self.graph_data['metadata']['entity_count']}")
            logger.info(f"   - Community nodes: {len(community_nodes)}")
            logger.info(f"   - Total relationships: {len(self.graph_data['relationships'])}")
            logger.info(f"   - Communities with summaries: {len(community_summaries)}")
            logger.info(f"   - Output saved to: {output_path}")
            return True
        else:
            return False

    # STEP 14: Save Final Output
    def _save_final_output(self, output_path: str) -> bool:
        """Save graph-data-final.json with DRIFT search metadata"""
        try:
            # Ensure output directory exists
            output_dir = Path(output_path).parent
            output_dir.mkdir(parents=True, exist_ok=True)

            # Save final output
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(self.graph_data, f, indent=2, ensure_ascii=False)

            # Calculate file size
            output_size = os.path.getsize(output_path)
            output_size_mb = output_size / (1024 * 1024)

            logger.info(f"💾 Saved final output: {output_path} ({output_size_mb:.2f} MB)")

            return True

        except Exception as e:
            logger.error(f"❌ Error saving final output: {e}")
            return False


# STEP 15: Main Entry Point
def main():
    """Main function to run Phase 3: Community Summarization with DRIFT Search Metadata"""
    logger.info("🚀 GraphRAG Phase 3: Community Summarization + DRIFT Search Metadata")
    logger.info("   Input: graph-data-phase-2.json (from Phase 2)")
    logger.info("   Output: graph-data-final.json (with DRIFT search metadata)")
    logger.info("")

    # Choose LLM provider from environment or default to cerebras
    llm_provider = os.getenv("GRAPH_LLM_PROVIDER", "cerebras").lower()
    logger.info(f"   Using LLM provider: {llm_provider.upper()}")

    try:
        # Initialize Phase 3 processor
        processor = GraphBuilderPhase3(llm_provider=llm_provider)

        # Generate summaries
        success = processor.generate_summaries()

        if success:
            logger.info("")
            logger.info("✅ Phase 3 completed successfully!")
            logger.info("🔍 DRIFT search metadata generated and included")
            logger.info("📋 Next step: Upload to Neo4j using 3b_save_to_graph_db.py")
            logger.info("   The graph-data-final.json is now ready for Neo4j import with DRIFT capabilities")
            return 0
        else:
            logger.error("")
            logger.error("❌ Phase 3 failed")
            logger.error("   Please check the logs above for details")
            return 1

    except Exception as e:
        logger.error(f"❌ Phase 3 pipeline failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return 1


if __name__ == "__main__":
    exit(main())
3_save_to_vector_db.ipynb
ADDED
@@ -0,0 +1,329 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save Markdown text into Vector DB"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-1: Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from my_config import MY_CONFIG"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-2: Read Markdown"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "\n",
    "pattern = os.path.join(MY_CONFIG.PROCESSED_DATA_DIR, '*.md')\n",
    "md_file_count = len(glob.glob(pattern, recursive=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 96 documents from 96 files\n"
     ]
    }
   ],
   "source": [
    "from llama_index.core import SimpleDirectoryReader\n",
    "\n",
    "reader = SimpleDirectoryReader(input_dir=MY_CONFIG.PROCESSED_DATA_DIR, recursive=False, required_exts=[\".md\"])\n",
    "documents = reader.load_data()\n",
    "\n",
    "print(f\"Loaded {len(documents)} documents from {md_file_count} files\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Doc ID: 20eef2cd-ee21-4dd4-baf6-eda09d5d793b\n",
      "Text: # Building the open future of AI We are technology developers,\n",
      "researchers, industry leaders and advocates who collaborate to advance\n",
      "safe, responsible AI rooted in open innovation.  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-3: Create Chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Created 223 chunks from 96 documents\n"
     ]
    }
   ],
   "source": [
    "from llama_index.core import Document\n",
    "from llama_index.core.node_parser import SentenceSplitter\n",
    "\n",
    "parser = SentenceSplitter(chunk_size=MY_CONFIG.CHUNK_SIZE, chunk_overlap=MY_CONFIG.CHUNK_OVERLAP)\n",
    "nodes = parser.get_nodes_from_documents(documents)\n",
    "print(f\"Created {len(nodes)} chunks from {len(documents)} documents\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-4: Setup Embedding Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If connection to https://huggingface.co/ fails, use the mirror endpoint below\n",
    "import os\n",
    "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sujee/apps/anaconda3/envs/allycat-6/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
    "from llama_index.core import Settings\n",
    "\n",
    "Settings.embed_model = HuggingFaceEmbedding(\n",
    "    model_name=MY_CONFIG.EMBEDDING_MODEL\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-5: Connect to Milvus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Connected to Milvus instance: workspace/rag_website_milvus.db\n"
     ]
    }
   ],
   "source": [
    "## Clear up any old data\n",
    "\n",
    "from pymilvus import MilvusClient\n",
    "\n",
    "milvus_client = MilvusClient(MY_CONFIG.DB_URI)\n",
    "print(\"✅ Connected to Milvus instance: \", MY_CONFIG.DB_URI)\n",
    "\n",
    "# if we already have a collection, clear it first\n",
    "if milvus_client.has_collection(collection_name=MY_CONFIG.COLLECTION_NAME):\n",
    "    milvus_client.drop_collection(collection_name=MY_CONFIG.COLLECTION_NAME)\n",
    "    print('✅ Cleared collection:', MY_CONFIG.COLLECTION_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-05-12 23:36:12,218 [DEBUG][_create_connection]: Created new connection using: f81ea0e5320b44f7b5ba8b89f6aa43f7 (async_milvus_client.py:600)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Connected Llama-index to Milvus instance: workspace/rag_website_milvus.db\n"
     ]
    }
   ],
   "source": [
    "# connect llama-index to vector db\n",
    "\n",
    "from llama_index.core import StorageContext\n",
    "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
    "\n",
    "vector_store = MilvusVectorStore(\n",
    "    uri=MY_CONFIG.DB_URI,\n",
    "    dim=MY_CONFIG.EMBEDDING_LENGTH,\n",
    "    collection_name=MY_CONFIG.COLLECTION_NAME,\n",
    "    overwrite=True\n",
    ")\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "\n",
    "print(\"✅ Connected Llama-index to Milvus instance: \", MY_CONFIG.DB_URI)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-6: Save to DB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 9 μs, sys: 0 ns, total: 9 μs\n",
      "Wall time: 18.8 μs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "## We save entire md documents into vector store\n",
    "\n",
    "# from llama_index.core import VectorStoreIndex\n",
    "\n",
    "# index = VectorStoreIndex.from_documents(\n",
    "#     documents, storage_context=storage_context\n",
    "# )\n",
    "# print(f\"✅ Saved {len(documents)} documents to db: {MY_CONFIG.DB_URI}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfully stored 223 chunks in Milvus collection 'pages'\n",
      "CPU times: user 900 ms, sys: 142 ms, total: 1.04 s\n",
      "Wall time: 807 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# save chunks into vector db\n",
    "\n",
    "from llama_index.core import VectorStoreIndex\n",
    "\n",
    "index = VectorStoreIndex(\n",
    "    nodes=nodes,\n",
    "    storage_context=storage_context,\n",
    ")\n",
    "\n",
    "print(f\"Successfully stored {len(nodes)} chunks in Milvus collection '{MY_CONFIG.COLLECTION_NAME}'\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "milvus_client.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "allycat-6",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
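Once the notebook has populated the collection, a quick way to sanity-check it is to reopen the same Milvus store and run a retrieval query against it. The following is a minimal sketch, not part of the upload, assuming the same MY_CONFIG values used above; the question string is purely illustrative, and using a retriever (rather than a query engine) avoids needing an LLM configured:

from my_config import MY_CONFIG
from llama_index.core import Settings, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.milvus import MilvusVectorStore

# Must match the embedding model the chunks were indexed with
Settings.embed_model = HuggingFaceEmbedding(model_name=MY_CONFIG.EMBEDDING_MODEL)

# Reopen the existing collection (overwrite=False so nothing is dropped)
vector_store = MilvusVectorStore(
    uri=MY_CONFIG.DB_URI,
    dim=MY_CONFIG.EMBEDDING_LENGTH,
    collection_name=MY_CONFIG.COLLECTION_NAME,
    overwrite=False,
)
index = VectorStoreIndex.from_vector_store(vector_store)

# Fetch the top-3 most similar chunks for an illustrative question
retriever = index.as_retriever(similarity_top_k=3)
for result in retriever.retrieve("What is this website about?"):
    print(result.score, result.node.get_content()[:100])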
3_save_to_vector_db.py
ADDED
@@ -0,0 +1,85 @@
from my_config import MY_CONFIG
import os
import glob
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from pymilvus import MilvusClient
from llama_index.core import StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.core import VectorStoreIndex
import logging


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# logger.setLevel(logging.INFO)

# Step-1: Read Markdown files
pattern = os.path.join(MY_CONFIG.PROCESSED_DATA_DIR, '*.md')
md_file_count = len(glob.glob(pattern, recursive=True))

reader = SimpleDirectoryReader(input_dir=MY_CONFIG.PROCESSED_DATA_DIR, recursive=False, required_exts=[".md"])
documents = reader.load_data()
logger.info(f"Loaded {len(documents)} documents from {md_file_count} files")

# Step-2: Create Chunks
parser = SentenceSplitter(chunk_size=MY_CONFIG.CHUNK_SIZE, chunk_overlap=MY_CONFIG.CHUNK_OVERLAP)
nodes = parser.get_nodes_from_documents(documents)
logger.info(f"Created {len(nodes)} chunks from {len(documents)} documents")

# Step-3: Setup Embedding Model
os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT

Settings.embed_model = HuggingFaceEmbedding(
    model_name=MY_CONFIG.EMBEDDING_MODEL
)

# Step-4: Create 2 Vector Databases (Vector RAG and Hybrid GraphRAG databases)

databases_to_create = [
    {
        "name": "Vector RAG Only",
        "uri": MY_CONFIG.MILVUS_URI_VECTOR,
        "description": "For Vector RAG systems"
    },
    {
        "name": "Hybrid GraphRAG",
        "uri": MY_CONFIG.MILVUS_URI_HYBRID_GRAPH,
        "description": "For Hybrid GraphRAG systems"
    }
]

for db_config in databases_to_create:
    logger.info(f"📦 Creating {db_config['name']} database...")

    # Connect to Milvus for this database
    milvus_client = MilvusClient(db_config['uri'])
    logger.info(f"✅ Connected to: {db_config['uri']}")

    if milvus_client.has_collection(collection_name=MY_CONFIG.COLLECTION_NAME):
        milvus_client.drop_collection(collection_name=MY_CONFIG.COLLECTION_NAME)
        logger.info(f"✅ Cleared collection: {MY_CONFIG.COLLECTION_NAME}")

    # Connect llama-index to vector db
    vector_store = MilvusVectorStore(
        uri=db_config['uri'],
        dim=MY_CONFIG.EMBEDDING_LENGTH,
        collection_name=MY_CONFIG.COLLECTION_NAME,
        overwrite=True
    )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Save chunks into vector db
    index = VectorStoreIndex(
        nodes=nodes,
        storage_context=storage_context,
    )

    logger.info(f"✅ Stored {len(nodes)} chunks in {db_config['name']}")
    milvus_client.close()

logger.info("🎉 Both databases created!")
logger.info(f"  • Vector RAG: {MY_CONFIG.MILVUS_URI_VECTOR}")
logger.info(f"  • Hybrid GraphRAG: {MY_CONFIG.MILVUS_URI_HYBRID_GRAPH}")
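After this script runs, both Milvus Lite files should contain the same collection with the same number of chunks. A small verification sketch under that assumption (the row-count key follows pymilvus' get_collection_stats return format):

from my_config import MY_CONFIG
from pymilvus import MilvusClient

# Check both databases created by the script above
for uri in (MY_CONFIG.MILVUS_URI_VECTOR, MY_CONFIG.MILVUS_URI_HYBRID_GRAPH):
    client = MilvusClient(uri)
    stats = client.get_collection_stats(collection_name=MY_CONFIG.COLLECTION_NAME)
    print(uri, "->", stats.get("row_count"), "rows")
    client.close()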
3_save_to_vector_db_zilliz.py
ADDED
@@ -0,0 +1,106 @@
"""
Cloud Vector Database Setup

Creates vector database collections on cloud infrastructure.
Supports both vector search and graph-based retrieval systems.
"""

from my_config import MY_CONFIG
import os
import sys
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from pymilvus import MilvusClient
from llama_index.core import StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.core import VectorStoreIndex
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Validate cloud database configuration
if not MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT:
    raise ValueError("Cloud endpoint configuration missing")
if not MY_CONFIG.ZILLIZ_TOKEN:
    raise ValueError("Cloud authentication token missing")

def main():
    logger.info("Initializing cloud database connection")

    # Load source documents
    logger.info("Loading documents")
    reader = SimpleDirectoryReader(input_dir=MY_CONFIG.PROCESSED_DATA_DIR, recursive=False, required_exts=[".md"])
    documents = reader.load_data()
    logger.info(f"Loaded {len(documents)} documents")

    # Process document chunks
    logger.info("Processing document chunks")
    parser = SentenceSplitter(chunk_size=MY_CONFIG.CHUNK_SIZE, chunk_overlap=MY_CONFIG.CHUNK_OVERLAP)
    nodes = parser.get_nodes_from_documents(documents)
    logger.info(f"Created {len(nodes)} chunks")

    # Initialize embedding model
    logger.info("Configuring embedding model")
    os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT

    Settings.embed_model = HuggingFaceEmbedding(
        model_name=MY_CONFIG.EMBEDDING_MODEL
    )

    # Create cloud database collection
    logger.info("Creating database collection")
    collection_name = MY_CONFIG.COLLECTION_NAME

    milvus_client = None
    try:
        # Connect to cloud database
        milvus_client = MilvusClient(
            uri=MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT,
            token=MY_CONFIG.ZILLIZ_TOKEN
        )

        # Remove existing collection if present
        if milvus_client.has_collection(collection_name=collection_name):
            milvus_client.drop_collection(collection_name=collection_name)

        # Initialize vector store
        vector_store = MilvusVectorStore(
            uri=MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT,
            token=MY_CONFIG.ZILLIZ_TOKEN,
            collection_name=collection_name,
            dim=MY_CONFIG.EMBEDDING_LENGTH,
            overwrite=True
        )
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        # Store document vectors
        logger.info(f"Processing {len(nodes)} document chunks")
        VectorStoreIndex(
            nodes=nodes,
            storage_context=storage_context,
        )

        logger.info(f"Database collection '{collection_name}' created successfully")

    except Exception as e:
        logger.error(f"Failed to create collection: {str(e)}")
        raise
    finally:
        if milvus_client:
            milvus_client.close()

    logger.info("Cloud database setup completed successfully")

if __name__ == "__main__":
    try:
        main()
        sys.exit(0)
    except KeyboardInterrupt:
        logger.info("Operation cancelled by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Fatal error: {str(e)}")
        sys.exit(1)
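The Zilliz variant needs MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT and MY_CONFIG.ZILLIZ_TOKEN populated before it runs (in this repo those values are expected to come from the environment files). A minimal connectivity probe under that assumption, separate from the script itself:

from my_config import MY_CONFIG
from pymilvus import MilvusClient

# Confirm the cloud cluster is reachable and list what it currently holds
client = MilvusClient(uri=MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT, token=MY_CONFIG.ZILLIZ_TOKEN)
print("Collections on cluster:", client.list_collections())
client.close()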
3b_save_to_graph_db.py
ADDED
@@ -0,0 +1,1050 @@
import asyncio
import json
import logging
import os
import sys
from typing import Any, Dict, Optional
from my_config import MY_CONFIG
from neo4j import GraphDatabase, Driver
from tqdm import tqdm
from fastmcp import FastMCP

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

GRAPH_DATA_DIR = MY_CONFIG.GRAPH_DATA_DIR
GRAPH_DATA_FILE = os.path.join(GRAPH_DATA_DIR, "graph-data-final.json")

class Neo4jConnection:
    def __init__(self):
        self.uri = MY_CONFIG.NEO4J_URI
        self.username = MY_CONFIG.NEO4J_USER
        self.password = MY_CONFIG.NEO4J_PASSWORD
        self.database = getattr(MY_CONFIG, "NEO4J_DATABASE", None)
        if not self.uri:
            raise ValueError("NEO4J_URI config is required")
        if not self.username:
            raise ValueError("NEO4J_USERNAME config is required")
        if not self.password:
            raise ValueError("NEO4J_PASSWORD config is required")
        if not self.database:
            raise ValueError("NEO4J_DATABASE config is required")
        self.driver: Optional[Driver] = None

    async def connect(self):
        if self.driver is None:
            try:
                self.driver = GraphDatabase.driver(
                    self.uri,
                    auth=(self.username, self.password)
                )

                await asyncio.get_event_loop().run_in_executor(
                    None, self.driver.verify_connectivity
                )
                logger.info("Connected to Neo4j")

            except Exception as e:
                logger.error(f"Connection failed: {e}")
                self.driver = None

    async def disconnect(self):
        if self.driver:
            await asyncio.get_event_loop().run_in_executor(
                None, self.driver.close
            )
            self.driver = None

    async def execute_query(self, query: str, parameters: Optional[Dict[str, Any]] = None):
        if not self.driver:
            raise ConnectionError("Not connected to Neo4j database")

        def run_query():
            with self.driver.session(database=self.database) as session:
                result = session.run(query, parameters or {})
                records = [record.data() for record in result]
                summary = result.consume()
                return records, summary

        return await asyncio.get_event_loop().run_in_executor(None, run_query)

neo4j_connection = Neo4jConnection()

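# Illustrative usage of the connection wrapper above (not executed by this
# module): the synchronous neo4j driver calls are pushed through
# run_in_executor so the async FastMCP tools below can await them, e.g.
#
#   await neo4j_connection.connect()
#   records, summary = await neo4j_connection.execute_query("RETURN 1 AS ok")
#   await neo4j_connection.disconnect()
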
app = FastMCP("Neo4j Graph Data Upload Server")

@app.tool()
async def execute_cypher(query: str, parameters: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    try:
        if not neo4j_connection.driver:
            await neo4j_connection.connect()
        if not neo4j_connection.driver:
            return {
                "status": "error",
                "error": "Unable to connect to Neo4j database",
                "details": "Check connection settings and network connectivity"
            }

        records, summary = await neo4j_connection.execute_query(query, parameters)

        return {
            "status": "success",
            "query": query,
            "parameters": parameters or {},
            "records": records,
            "record_count": len(records),
            "execution_time_ms": summary.result_available_after,
            "summary": {
                "query_type": summary.query_type,
                "counters": dict(summary.counters) if summary.counters else {}
            }
        }

    except Exception as e:
        return {
            "status": "error",
            "query": query,
            "error": str(e)
        }


@app.tool()
async def get_database_schema() -> Dict[str, Any]:
    try:
        if not neo4j_connection.driver:
            await neo4j_connection.connect()
        if not neo4j_connection.driver:
            return {
                "status": "error",
                "error": "Unable to connect to Neo4j database"
            }

        labels_records, _ = await neo4j_connection.execute_query("CALL db.labels()")
        labels = [record["label"] for record in labels_records]

        rel_records, _ = await neo4j_connection.execute_query("CALL db.relationshipTypes()")
        relationships = [record["relationshipType"] for record in rel_records]

        prop_records, _ = await neo4j_connection.execute_query("CALL db.propertyKeys()")
        properties = [record["propertyKey"] for record in prop_records]

        try:
            constraint_records, _ = await neo4j_connection.execute_query("SHOW CONSTRAINTS")
            constraints = [dict(record) for record in constraint_records]
        except Exception:
            constraints = []

        try:
            index_records, _ = await neo4j_connection.execute_query("SHOW INDEXES")
            indexes = [dict(record) for record in index_records]
        except Exception:
            indexes = []

        return {
            "status": "success",
            "schema": {
                "node_labels": labels,
                "relationship_types": relationships,
                "property_keys": properties,
                "constraints": constraints,
                "indexes": indexes
            }
        }

    except Exception as e:
        return {
            "status": "error",
            "error": str(e)
        }


@app.tool()
async def get_node_count(label: Optional[str] = None) -> Dict[str, Any]:
    try:
        if not neo4j_connection.driver:
            await neo4j_connection.connect()
        if not neo4j_connection.driver:
            return {
                "status": "error",
                "error": "Unable to connect to Neo4j database"
            }

        if label:
            query = f"MATCH (n:`{label}`) RETURN count(n) as count"
        else:
            query = "MATCH (n) RETURN count(n) as count"

        records, _ = await neo4j_connection.execute_query(query)
        count = records[0]["count"] if records else 0

        return {
            "status": "success",
            "label": label,
            "count": count
        }

    except Exception as e:
        return {
            "status": "error",
            "error": str(e)
        }


@app.tool()
async def get_relationship_count(relationship_type: Optional[str] = None) -> Dict[str, Any]:
    try:
        if not neo4j_connection.driver:
            await neo4j_connection.connect()
        if not neo4j_connection.driver:
            return {
                "status": "error",
                "error": "Unable to connect to Neo4j database"
            }

        # note: the undirected pattern matches each relationship in both directions
        if relationship_type:
            query = f"MATCH ()-[r:`{relationship_type}`]-() RETURN count(r) as count"
        else:
            query = "MATCH ()-[r]-() RETURN count(r) as count"

        records, _ = await neo4j_connection.execute_query(query)
        count = records[0]["count"] if records else 0

        return {
            "status": "success",
            "relationship_type": relationship_type,
            "count": count
        }

    except Exception as e:
        return {
            "status": "error",
            "error": str(e)
        }


@app.tool()
async def health_check() -> Dict[str, Any]:
    try:
        if not neo4j_connection.driver:
            await neo4j_connection.connect()

        if not neo4j_connection.driver:
            return {
                "status": "unhealthy",
                "reason": "Unable to connect to Neo4j database",
                "configuration": {
                    "uri": neo4j_connection.uri,
                    "database": neo4j_connection.database,
                    "username": neo4j_connection.username
                }
            }

        # A simple query to test connectivity
        records, _ = await neo4j_connection.execute_query("RETURN 1 as test")

        if records and records[0]["test"] == 1:
            return {
                "status": "healthy",
                "database": neo4j_connection.database,
                "uri": neo4j_connection.uri,
                "ssl_enabled": neo4j_connection.uri.startswith(('neo4j+s://', 'bolt+s://')),
                "message": "Neo4j connection is working properly"
            }
        else:
            return {
                "status": "unhealthy",
                "reason": "Query execution failed or returned unexpected results"
            }

    except Exception as e:
        return {
            "status": "unhealthy",
            "reason": str(e)
        }


async def clear_database_impl() -> Dict[str, Any]:
    try:
        if not neo4j_connection.driver:
            await neo4j_connection.connect()
        if not neo4j_connection.driver:
            return {
                "status": "error",
                "error": "Unable to connect to Neo4j database"
            }

        node_count_query = "MATCH (n) RETURN count(n) as count"
        rel_count_query = "MATCH ()-[r]->() RETURN count(r) as count"

        node_records, _ = await neo4j_connection.execute_query(node_count_query)
        rel_records, _ = await neo4j_connection.execute_query(rel_count_query)

        nodes_before = node_records[0]["count"] if node_records else 0
        rels_before = rel_records[0]["count"] if rel_records else 0

        await neo4j_connection.execute_query("MATCH ()-[r]->() DELETE r")
        await neo4j_connection.execute_query("MATCH (n) DELETE n")

        print(f"✅ Cleared: {nodes_before} nodes, {rels_before} relationships")

        return {
            "status": "success",
            "message": "Database cleared successfully",
            "statistics": {
                "nodes_removed": nodes_before,
                "relationships_removed": rels_before
            }
        }

    except Exception as e:
        return {
            "status": "error",
            "error": str(e)
        }


@app.tool()
async def clear_database() -> Dict[str, Any]:
    return await clear_database_impl()


async def upload_graph_data_impl() -> Dict[str, Any]:
    try:
        if not neo4j_connection.driver:
            await neo4j_connection.connect()
        if not neo4j_connection.driver:
            return {
                "status": "error",
                "error": "Unable to connect to Neo4j database"
            }

        clear_result = await clear_database_impl()
        if clear_result["status"] != "success":
            return clear_result

        # Check if graph data file exists
        if not os.path.exists(GRAPH_DATA_FILE):
            return {
                "status": "error",
                "error": f"Graph data file not found: {GRAPH_DATA_FILE}"
            }

        with open(GRAPH_DATA_FILE, 'r', encoding='utf-8') as f:
            graph_data = json.load(f)

        if not isinstance(graph_data, dict) or 'nodes' not in graph_data:
            return {
                "status": "error",
                "error": "Invalid graph data format. Expected JSON with 'nodes' array"
            }

        nodes = graph_data.get('nodes', [])
        relationships = graph_data.get('relationships', [])
        communities_data = graph_data.get('communities', {})
        drift_metadata = graph_data.get('drift_search_metadata', {})
        global_metadata = graph_data.get('metadata', {})
        search_optimization = drift_metadata.get('search_optimization', {}) if drift_metadata else {}

        communities_count = len(drift_metadata.get('community_search_index', {})) if drift_metadata else 0
        drift_count = 1 if drift_metadata else 0
        metadata_count = 1 if global_metadata else 0
        optimization_count = 1 if search_optimization else 0
        communities_metadata_count = 1 if communities_data else 0
        drift_config_count = 1 if (drift_metadata and 'configuration' in drift_metadata) else 0
        community_search_index_count = 1 if (drift_metadata and 'community_search_index' in drift_metadata) else 0
        search_optimization_object_count = 1 if (drift_metadata and 'search_optimization' in drift_metadata) else 0
        embeddings_object_count = 1 if (drift_metadata and 'community_search_index' in drift_metadata) else 0
        embeddings_count = communities_count if (drift_metadata and 'community_search_index' in drift_metadata) else 0

        total_items = (len(nodes) + len(relationships) + communities_count + drift_count +
                       metadata_count + optimization_count + communities_metadata_count +
                       drift_config_count + community_search_index_count +
                       search_optimization_object_count + embeddings_object_count + embeddings_count)

        print(f"Processing: {len(nodes)} nodes, {len(relationships)} relationships, {communities_count} communities, {total_items - len(nodes) - len(relationships) - communities_count} metadata")

        upload_stats = {
            "nodes_processed": 0,
            "nodes_created": 0,
            "relationships_processed": 0,
            "relationships_created": 0,
            "communities_processed": 0,
            "communities_created": 0,
            "drift_metadata_created": 0,
            "global_metadata_created": 0,
            "search_optimization_created": 0,
            "communities_metadata_created": 0,
            "drift_config_created": 0,
            "community_search_index_created": 0,
            "search_optimization_object_created": 0,
            "embeddings_object_created": 0,
            "embeddings_created": 0,
            "errors": []
        }

        with tqdm(total=len(nodes), desc="Nodes", unit="node", ncols=80, leave=False) as pbar:
            for node in nodes:
                try:
                    upload_stats["nodes_processed"] += 1

                    node_id = node['id']
                    labels = node['labels']
                    properties = node.get('properties', {})

                    # Create node with labels
                    labels_str = ':'.join([f"`{label}`" for label in labels])
                    query = f"MERGE (n:{labels_str} {{id: $id}}) SET n += $props RETURN n"

                    await neo4j_connection.execute_query(query, {
                        "id": node_id,
                        "props": properties
                    })

                    upload_stats["nodes_created"] += 1
                    pbar.update(1)

                except Exception as e:
                    upload_stats["errors"].append(f"Node upload error: {str(e)}")
                    pbar.update(1)

        with tqdm(total=len(relationships), desc="Relationships", unit="rel", ncols=80, leave=False) as pbar:
            for rel in relationships:
                upload_stats["relationships_processed"] += 1

                start_node = rel['startNode']
                end_node = rel['endNode']
                rel_type = rel['type']
                # relationship's own properties (distinct from the node properties above)
                properties = rel.get('properties', {})

                try:
                    query = f"""
                    MATCH (a {{id: $start_node}})
                    MATCH (b {{id: $end_node}})
                    CREATE (a)-[r:`{rel_type}`]->(b)
                    SET r += $props
                    RETURN r
                    """

                    await neo4j_connection.execute_query(query, {
                        "start_node": start_node,
                        "end_node": end_node,
                        "props": properties
                    })

                    upload_stats["relationships_created"] += 1
                    pbar.update(1)

                except Exception as e:
                    error_msg = f"Relationship upload error for rel {rel}: {str(e)}"
                    logger.error(error_msg)
                    upload_stats["errors"].append(error_msg)
                    pbar.update(1)

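        # Design note: nodes use MERGE keyed on {id}, so re-running the node pass
        # is idempotent, while relationships use CREATE and would duplicate on
        # re-runs; the clear_database_impl() call at the top of this function is
        # what keeps repeated uploads clean. Illustrative Cypher shape only
        # (the RELATED_TO type here is a made-up example):
        #   MERGE (n:`Entity` {id: $id}) SET n += $props
        #   CREATE (a)-[r:`RELATED_TO`]->(b) SET r += $props
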
        if drift_metadata and 'community_search_index' in drift_metadata:
            community_index = drift_metadata['community_search_index']

            with tqdm(total=len(community_index), desc="Communities", unit="comm", ncols=80, leave=False) as pbar:
                for comm_id, comm_data in community_index.items():
                    try:
                        upload_stats["communities_processed"] += 1

                        embeddings = comm_data.get('embeddings', {})
                        summary_embedding = embeddings.get('summary_embedding', [])
                        hyde_embeddings = embeddings.get('hyde_embeddings', [])

                        follow_up_templates_json = json.dumps(comm_data.get('follow_up_templates', {}))
                        hyde_embeddings_json = json.dumps(hyde_embeddings)

                        # Get statistics
                        stats = comm_data.get('statistics', {})

                        # Community properties with documented attributes from JSON
                        community_props = {
                            "id": comm_id,
                            "summary": comm_data.get('summary', ''),
                            "key_entities": comm_data.get('key_entities', []),
                            "member_count": stats.get('member_count', 0),
                            "member_ids": stats.get('member_ids', []),
                            "internal_edges": stats.get('internal_edges', 0),
                            "density": stats.get('density', 0.0),
                            "avg_degree": stats.get('avg_degree', 0.0),
                            "follow_up_templates": follow_up_templates_json,
                            "hyde_embeddings": hyde_embeddings_json
                        }

                        # Add summary embedding as List<Float> if available
                        if summary_embedding and isinstance(summary_embedding, list):
                            community_props["summary_embedding"] = summary_embedding
                            community_props["embedding_dimensions"] = len(summary_embedding)

                        # Create Community node
                        query = """
                        MERGE (c:Community {id: $id})
                        SET c += $props
                        RETURN c
                        """

                        await neo4j_connection.execute_query(query, {
                            "id": comm_id,
                            "props": community_props
                        })

                        upload_stats["communities_created"] += 1
                        pbar.update(1)

                    except Exception as e:
                        error_msg = f"Community upload error for {comm_id}: {str(e)}"
                        logger.error(error_msg)
                        upload_stats["errors"].append(error_msg)
                        pbar.update(1)

        if drift_metadata:
            try:
                query_routing_config_json = json.dumps(drift_metadata.get('query_routing_config', {}))
                performance_monitoring_json = json.dumps(drift_metadata.get('performance_monitoring', {}))
                configuration_json = json.dumps(drift_metadata.get('configuration', {}))
                community_search_index_json = json.dumps(drift_metadata.get('community_search_index', {}))
                search_optimization_json = json.dumps(drift_metadata.get('search_optimization', {}))

                # Build a compact embeddings object (per-community) to store on the DRIFT node
                embeddings_per_community = {}
                for _comm_id, _comm_data in drift_metadata.get('community_search_index', {}).items():
                    emb = _comm_data.get('embeddings')
                    if emb:
                        # Only keep summary and hyde to limit size
                        embeddings_per_community[_comm_id] = {
                            'summary_embedding': emb.get('summary_embedding'),
                            'hyde_embeddings': emb.get('hyde_embeddings')
                        }

                embeddings_json = json.dumps(embeddings_per_community)

                drift_props = {
                    "version": drift_metadata.get('version', '1.0'),
                    "generated_timestamp": drift_metadata.get('generated_timestamp', ''),
                    "query_routing_config": query_routing_config_json,
                    "performance_monitoring": performance_monitoring_json,
                    "configuration": configuration_json,
                    # Nested objects stored as JSON strings for direct inspection
                    "community_search_index": community_search_index_json,
                    "search_optimization": search_optimization_json,
                    "embeddings": embeddings_json,
                    "total_communities": len(drift_metadata.get('community_search_index', {}))
                }

                # Create single DRIFT metadata node
                query = """
                MERGE (d:DriftMetadata {version: $version})
                SET d += $props
                RETURN d
                """

                await neo4j_connection.execute_query(query, {
                    "version": drift_metadata.get('version', '1.0'),
                    "props": drift_props
                })

                upload_stats["drift_metadata_created"] = 1

            except Exception as e:
                error_msg = f"DRIFT metadata upload error: {str(e)}"
                logger.error(error_msg)
                upload_stats["errors"].append(error_msg)

        if global_metadata:
            try:
                # Convert nested objects to JSON strings for Neo4j compatibility
                recovery_stats_json = json.dumps(global_metadata.get('recovery_stats', {}))
                member_extraction_stats_json = json.dumps(global_metadata.get('member_extraction_stats', {}))
                community_detection_json = json.dumps(global_metadata.get('community_detection', {}))

                metadata_props = {
                    "node_count": global_metadata.get('node_count', 0),
                    "relationship_count": global_metadata.get('relationship_count', 0),
                    "generated_at": global_metadata.get('generated_at', ''),
                    "generator": global_metadata.get('generator', ''),
                    "llm_provider": global_metadata.get('llm_provider', ''),
                    "model": global_metadata.get('model', ''),
                    "format_version": global_metadata.get('format_version', ''),
                    "last_updated": global_metadata.get('last_updated', ''),
                    "phase": global_metadata.get('phase', ''),
                    "entity_count": global_metadata.get('entity_count', global_metadata.get('node_count', 0)),
                    "community_count": global_metadata.get('community_count', 0),
                    "total_node_count": global_metadata.get('total_node_count', global_metadata.get('node_count', 0)),
                    "total_relationship_count": global_metadata.get('total_relationship_count', global_metadata.get('relationship_count', 0)),

                    # Complex nested objects as JSON strings
                    "recovery_stats": recovery_stats_json,
                    "member_extraction_stats": member_extraction_stats_json,
                    "community_detection": community_detection_json
                }

                # Create Global Metadata node
                query = """
                MERGE (m:GraphMetadata {generator: $generator})
                SET m += $props
                RETURN m
                """

                await neo4j_connection.execute_query(query, {
                    "generator": global_metadata.get('generator', 'unknown'),
                    "props": metadata_props
                })

                upload_stats["global_metadata_created"] = 1

            except Exception as e:
                error_msg = f"Global metadata upload error: {str(e)}"
                logger.error(error_msg)
                upload_stats["errors"].append(error_msg)

        if search_optimization:
            try:
                optimization_props = {
                    "total_communities": search_optimization.get('total_communities', 0),
                    "avg_community_size": search_optimization.get('avg_community_size', 0.0),
                    "graph_density": search_optimization.get('graph_density', 0.0),
                    "total_nodes": search_optimization.get('total_nodes', 0),
                    "total_edges": search_optimization.get('total_edges', 0),
                    "max_primer_communities": search_optimization.get('max_primer_communities', 0)
                }

                query = """
                MERGE (s:SearchOptimization {id: 'global'})
                SET s += $props
                RETURN s
                """

                await neo4j_connection.execute_query(query, {
                    "props": optimization_props
                })

                upload_stats["search_optimization_created"] = 1

            except Exception as e:
                error_msg = f"Search optimization upload error: {str(e)}"
                logger.error(error_msg)
                upload_stats["errors"].append(error_msg)

        if communities_data:
            try:
                communities_props = {
                    "algorithm": communities_data.get('algorithm', ''),
                    "total_communities": communities_data.get('total_communities', 0),
                    "modularity_score": communities_data.get('modularity_score', 0.0),
                    "summaries": json.dumps(communities_data.get('summaries', {})),
                    "statistics": json.dumps(communities_data.get('statistics', {}))
                }

                query = """
                MERGE (cm:CommunitiesMetadata {algorithm: $algorithm})
                SET cm += $props
                RETURN cm
                """

                await neo4j_connection.execute_query(query, {
                    "algorithm": communities_data.get('algorithm', 'unknown'),
                    "props": communities_props
                })

                upload_stats["communities_metadata_created"] = 1

            except Exception as e:
                error_msg = f"Communities metadata upload error: {str(e)}"
                logger.error(error_msg)
                upload_stats["errors"].append(error_msg)

        if drift_metadata and 'configuration' in drift_metadata:
            try:
                config = drift_metadata['configuration']

                drift_config_props = {
                    "max_iterations": config.get('max_iterations', 0),
                    "confidence_threshold": config.get('confidence_threshold', 0.0),
                    "top_k_communities": config.get('top_k_communities', 0),
                    "hyde_expansion_count": config.get('hyde_expansion_count', 0),
                    "termination_criteria": config.get('termination_criteria', ''),
                    "version": drift_metadata.get('version', '1.0'),
                    "generated_timestamp": drift_metadata.get('generated_timestamp', '')
                }

                query = """
                MERGE (dc:DriftConfiguration {version: $version})
                SET dc += $props
                RETURN dc
                """

                await neo4j_connection.execute_query(query, {
                    "version": drift_metadata.get('version', '1.0'),
                    "props": drift_config_props
                })

                upload_stats["drift_config_created"] = 1

            except Exception as e:
                error_msg = f"DRIFT Configuration upload error: {str(e)}"
                logger.error(error_msg)
                upload_stats["errors"].append(error_msg)

        if drift_metadata and 'community_search_index' in drift_metadata:
            try:
                community_search_index = drift_metadata['community_search_index']

                search_index_props = {
                    "version": drift_metadata.get('version', '1.0'),
                    "total_communities": len(community_search_index),
                    "community_data": json.dumps(community_search_index),
                    "generated_timestamp": drift_metadata.get('generated_timestamp', ''),
                    "index_type": "community_search"
                }

                query = """
                MERGE (csi:CommunitySearchIndex {version: $version})
                SET csi += $props
                RETURN csi
                """

                await neo4j_connection.execute_query(query, {
                    "version": drift_metadata.get('version', '1.0'),
                    "props": search_index_props
                })

                upload_stats["community_search_index_created"] = 1

            except Exception as e:
                error_msg = f"Community Search Index upload error: {str(e)}"
                logger.error(error_msg)
                upload_stats["errors"].append(error_msg)

        if drift_metadata and 'search_optimization' in drift_metadata:
            try:
                search_opt_data = drift_metadata['search_optimization']

                search_opt_props = {
                    "total_communities": search_opt_data.get('total_communities', 0),
                    "avg_community_size": search_opt_data.get('avg_community_size', 0.0),
                    "graph_density": search_opt_data.get('graph_density', 0.0),
                    "total_nodes": search_opt_data.get('total_nodes', 0),
                    "total_edges": search_opt_data.get('total_edges', 0),
                    "max_primer_communities": search_opt_data.get('max_primer_communities', 0),
                    "optimization_version": drift_metadata.get('version', '1.0')
                }

                query = """
                MERGE (so:SearchOptimizationObject {optimization_version: $version})
                SET so += $props
                RETURN so
                """

                await neo4j_connection.execute_query(query, {
                    "version": drift_metadata.get('version', '1.0'),
                    "props": search_opt_props
                })

                upload_stats["search_optimization_object_created"] = 1

            except Exception as e:
                error_msg = f"Search Optimization object upload error: {str(e)}"
                logger.error(error_msg)
                upload_stats["errors"].append(error_msg)

        if drift_metadata and 'community_search_index' in drift_metadata:
            try:
                community_index = drift_metadata['community_search_index']

                total_embeddings = 0
                total_dimensions = 0
                embedding_communities = []

                for comm_id, comm_data in community_index.items():
                    embeddings_data = comm_data.get('embeddings', {})
                    if embeddings_data:
                        total_embeddings += 1
                        if embeddings_data.get('summary_embedding'):
                            total_dimensions = len(embeddings_data.get('summary_embedding', []))
                        embedding_communities.append(comm_id)

                # Create embeddings object properties
                embeddings_obj_props = {
                    "total_embeddings": total_embeddings,
                    "embedding_dimensions": total_dimensions,
                    "embedding_computation": "computed via text-embedding-ada-002",
                    "communities_with_embeddings": embedding_communities,
                    "embedding_type": "community_summaries",
                    "embeddings_version": drift_metadata.get('version', '1.0')
                }

                query = """
                MERGE (eo:EmbeddingsObject {embeddings_version: $version})
                SET eo += $props
                RETURN eo
                """

                await neo4j_connection.execute_query(query, {
                    "version": drift_metadata.get('version', '1.0'),
                    "props": embeddings_obj_props
                })

                upload_stats["embeddings_object_created"] = 1

            except Exception as e:
                error_msg = f"Embeddings object upload error: {str(e)}"
                logger.error(error_msg)
                upload_stats["errors"].append(error_msg)

        if drift_metadata and 'community_search_index' in drift_metadata:
            community_index = drift_metadata['community_search_index']

            for comm_id, comm_data in community_index.items():
                try:
                    embeddings_data = comm_data.get('embeddings', {})
                    if embeddings_data:
                        embeddings_props = {
                            "community_id": comm_id,
                            "summary_embedding": embeddings_data.get('summary_embedding', []),
                            "hyde_embeddings": json.dumps(embeddings_data.get('hyde_embeddings', [])),
                            "embedding_dimensions": len(embeddings_data.get('summary_embedding', [])),
                            "embedding_computation": embeddings_data.get('embedding_computation', 'computed')
                        }

                        query = """
                        MERGE (e:Embeddings {community_id: $community_id})
                        SET e += $props
                        RETURN e
                        """

                        await neo4j_connection.execute_query(query, {
                            "community_id": comm_id,
                            "props": embeddings_props
                        })

                        upload_stats["embeddings_created"] += 1

                except Exception as e:
                    error_msg = f"Embeddings upload error for {comm_id}: {str(e)}"
                    logger.error(error_msg)
                    upload_stats["errors"].append(error_msg)

        if communities_count > 0:
            try:
                # Connect entities to their communities based on community_id property
                community_rel_query = """
                MATCH (n) WHERE n.community_id IS NOT NULL
                MATCH (c:Community {id: n.community_id})
                MERGE (n)-[:BELONGS_TO_COMMUNITY]->(c)
                """
                await neo4j_connection.execute_query(community_rel_query, {})

            except Exception as e:
                error_msg = f"Community relationship creation error: {str(e)}"
                logger.error(error_msg)
                upload_stats["errors"].append(error_msg)

        # Calculate success percentage for all components
        nodes_success_rate = (upload_stats["nodes_created"] / len(nodes) * 100) if nodes else 100
        rels_success_rate = (upload_stats["relationships_created"] / len(relationships) * 100) if relationships else 100
        communities_success_rate = (upload_stats["communities_created"] / communities_count * 100) if communities_count else 100
        drift_success_rate = (upload_stats["drift_metadata_created"] / drift_count * 100) if drift_count else 100

        embedding_dimensions = 0
        if drift_metadata and 'community_search_index' in drift_metadata:
            for comm_data in drift_metadata['community_search_index'].values():
                embeddings = comm_data.get('embeddings', {})
                summary_embedding = embeddings.get('summary_embedding', [])
                if summary_embedding:
                    embedding_dimensions = len(summary_embedding)
                    break

        total_created = (upload_stats["nodes_created"] + upload_stats["relationships_created"] +
                         upload_stats["communities_created"] + upload_stats["drift_metadata_created"] +
                         upload_stats["global_metadata_created"] + upload_stats["search_optimization_created"] +
                         upload_stats["communities_metadata_created"] + upload_stats["drift_config_created"] +
                         upload_stats["community_search_index_created"] + upload_stats["search_optimization_object_created"] +
                         upload_stats["embeddings_object_created"] + upload_stats["embeddings_created"])
        overall_success_rate = (total_created / total_items * 100) if total_items else 100
|
| 863 |
+
|
| 864 |
+
result = {
|
| 865 |
+
"status": "success",
|
| 866 |
+
"message": "Graph data upload completed successfully",
|
| 867 |
+
"statistics": upload_stats,
|
| 868 |
+
"success_rates": {
|
| 869 |
+
"nodes": f"{nodes_success_rate:.1f}%",
|
| 870 |
+
"relationships": f"{rels_success_rate:.1f}%",
|
| 871 |
+
"communities": f"{communities_success_rate:.1f}%",
|
| 872 |
+
"drift_metadata": f"{drift_success_rate:.1f}%",
|
| 873 |
+
"global_metadata": f"{100.0 if upload_stats['global_metadata_created'] > 0 else 0:.1f}%",
|
| 874 |
+
"search_optimization": f"{100.0 if upload_stats['search_optimization_created'] > 0 else 0:.1f}%",
|
| 875 |
+
"communities_metadata": f"{100.0 if upload_stats['communities_metadata_created'] > 0 else 0:.1f}%",
|
| 876 |
+
"drift_config": f"{100.0 if upload_stats['drift_config_created'] > 0 else 0:.1f}%",
|
| 877 |
+
"community_search_index": f"{100.0 if upload_stats['community_search_index_created'] > 0 else 0:.1f}%",
|
| 878 |
+
"search_optimization_object": f"{100.0 if upload_stats['search_optimization_object_created'] > 0 else 0:.1f}%",
|
| 879 |
+
"embeddings_object": f"{100.0 if upload_stats['embeddings_object_created'] > 0 else 0:.1f}%",
|
| 880 |
+
"embeddings": f"{(upload_stats['embeddings_created']/communities_count*100) if communities_count > 0 else 0:.1f}%",
|
| 881 |
+
"overall": f"{overall_success_rate:.1f}%"
|
| 882 |
+
},
|
| 883 |
+
"source_file": GRAPH_DATA_FILE,
|
| 884 |
+
"architecture_summary": {
|
| 885 |
+
"nodes": f"{upload_stats['nodes_created']}/{len(nodes)}",
|
| 886 |
+
"relationships": f"{upload_stats['relationships_created']}/{len(relationships)}",
|
| 887 |
+
"communities": f"{upload_stats['communities_created']}/{communities_count}",
|
| 888 |
+
"drift_metadata": f"{upload_stats['drift_metadata_created']}/{drift_count}",
|
| 889 |
+
"global_metadata": f"{upload_stats['global_metadata_created']}/1",
|
| 890 |
+
"search_optimization": f"{upload_stats['search_optimization_created']}/1",
|
| 891 |
+
"embeddings_stored": communities_count > 0,
|
| 892 |
+
"vector_dimensions": embedding_dimensions,
|
| 893 |
+
"complete_metadata_coverage": upload_stats['global_metadata_created'] > 0 and upload_stats['search_optimization_created'] > 0
|
| 894 |
+
}
|
| 895 |
+
}
|
| 896 |
+
|
| 897 |
+
# Print concise upload summary
|
| 898 |
+
print(f"\n✅ Upload completed: {total_created}/{total_items} items ({overall_success_rate:.1f}%)")
|
| 899 |
+
|
| 900 |
+
# Show all node types created
|
| 901 |
+
total_entity_nodes = upload_stats['nodes_created']
|
| 902 |
+
total_metadata_nodes = (upload_stats['drift_metadata_created'] +
|
| 903 |
+
upload_stats['global_metadata_created'] +
|
| 904 |
+
upload_stats['search_optimization_created'] +
|
| 905 |
+
upload_stats['communities_metadata_created'] +
|
| 906 |
+
upload_stats['drift_config_created'] +
|
| 907 |
+
upload_stats['community_search_index_created'] +
|
| 908 |
+
upload_stats['search_optimization_object_created'] +
|
| 909 |
+
upload_stats['embeddings_object_created'])
|
| 910 |
+
|
| 911 |
+
print(f" Entity Nodes: {total_entity_nodes}, Community Nodes: {upload_stats['communities_created']}, Metadata Nodes: {total_metadata_nodes}, Embedding Nodes: {upload_stats['embeddings_created']}")
|
| 912 |
+
print(f" Relationships: {upload_stats['relationships_created']}")
|
| 913 |
+
|
| 914 |
+
if upload_stats['errors']:
|
| 915 |
+
print(f" ⚠️ {len(upload_stats['errors'])} errors encountered")
|
| 916 |
+
|
| 917 |
+
return result
|
| 918 |
+
|
| 919 |
+
except Exception as e:
|
| 920 |
+
logger.error(f"Graph data upload failed: {str(e)}")
|
| 921 |
+
return {
|
| 922 |
+
"status": "error",
|
| 923 |
+
"error": str(e)
|
| 924 |
+
}
|
| 925 |
+
|
| 926 |
+
|
| 927 |
+
@app.tool()
|
| 928 |
+
async def upload_graph_data() -> Dict[str, Any]:
|
| 929 |
+
return await upload_graph_data_impl()
|
| 930 |
+
|
| 931 |
+
|
| 932 |
+
@app.tool()
|
| 933 |
+
async def check_graph_data_file() -> Dict[str, Any]:
|
| 934 |
+
try:
|
| 935 |
+
if not os.path.exists(GRAPH_DATA_FILE):
|
| 936 |
+
return {
|
| 937 |
+
"status": "not_found",
|
| 938 |
+
"path": GRAPH_DATA_FILE,
|
| 939 |
+
"message": "Graph data file does not exist"
|
| 940 |
+
}
|
| 941 |
+
|
| 942 |
+
# Get file stats
|
| 943 |
+
file_stats = os.stat(GRAPH_DATA_FILE)
|
| 944 |
+
file_size = file_stats.st_size
|
| 945 |
+
|
| 946 |
+
# Try to parse the JSON to validate format
|
| 947 |
+
try:
|
| 948 |
+
with open(GRAPH_DATA_FILE, 'r', encoding='utf-8') as f:
|
| 949 |
+
graph_data = json.load(f)
|
| 950 |
+
|
| 951 |
+
nodes_count = len(graph_data.get('nodes', []))
|
| 952 |
+
relationships_count = len(graph_data.get('relationships', []))
|
| 953 |
+
|
| 954 |
+
return {
|
| 955 |
+
"status": "found",
|
| 956 |
+
"path": GRAPH_DATA_FILE,
|
| 957 |
+
"file_size_bytes": file_size,
|
| 958 |
+
"nodes_count": nodes_count,
|
| 959 |
+
"relationships_count": relationships_count,
|
| 960 |
+
"valid_json": True
|
| 961 |
+
}
|
| 962 |
+
|
| 963 |
+
except json.JSONDecodeError as e:
|
| 964 |
+
return {
|
| 965 |
+
"status": "invalid",
|
| 966 |
+
"path": GRAPH_DATA_FILE,
|
| 967 |
+
"file_size_bytes": file_size,
|
| 968 |
+
"valid_json": False,
|
| 969 |
+
"json_error": str(e)
|
| 970 |
+
}
|
| 971 |
+
|
| 972 |
+
except Exception as e:
|
| 973 |
+
return {
|
| 974 |
+
"status": "error",
|
| 975 |
+
"error": str(e),
|
| 976 |
+
"error_type": type(e).__name__
|
| 977 |
+
}
|
| 978 |
+
|
| 979 |
+
|
| 980 |
+
@app.tool()
|
| 981 |
+
async def get_connection_info() -> Dict[str, Any]:
|
| 982 |
+
try:
|
| 983 |
+
# Always return configuration info even if not connected
|
| 984 |
+
deployment_type = "Self-hosted"
|
| 985 |
+
if "databases.neo4j.io" in neo4j_connection.uri:
|
| 986 |
+
deployment_type = "Neo4j Aura"
|
| 987 |
+
elif "sandbox" in neo4j_connection.uri:
|
| 988 |
+
deployment_type = "Neo4j Sandbox"
|
| 989 |
+
elif any(cloud in neo4j_connection.uri for cloud in ["aws", "gcp", "azure"]):
|
| 990 |
+
deployment_type = "Enterprise Cloud"
|
| 991 |
+
|
| 992 |
+
connection_info = {
|
| 993 |
+
"status": "success",
|
| 994 |
+
"connection": {
|
| 995 |
+
"uri": neo4j_connection.uri,
|
| 996 |
+
"database": neo4j_connection.database,
|
| 997 |
+
"username": neo4j_connection.username,
|
| 998 |
+
"deployment_type": deployment_type,
|
| 999 |
+
"ssl_enabled": neo4j_connection.uri.startswith(('neo4j+s://', 'bolt+s://')),
|
| 1000 |
+
"connected": neo4j_connection.driver is not None
|
| 1001 |
+
},
|
| 1002 |
+
"capabilities": {
|
| 1003 |
+
"cypher_queries": True,
|
| 1004 |
+
"schema_inspection": True,
|
| 1005 |
+
"bulk_operations": True,
|
| 1006 |
+
"graph_algorithms": "unknown",
|
| 1007 |
+
"multi_database": "unknown"
|
| 1008 |
+
}
|
| 1009 |
+
}
|
| 1010 |
+
|
| 1011 |
+
if neo4j_connection.driver:
|
| 1012 |
+
try:
|
| 1013 |
+
server_info_records, _ = await neo4j_connection.execute_query(
|
| 1014 |
+
"CALL dbms.components() YIELD name, versions, edition"
|
| 1015 |
+
)
|
| 1016 |
+
connection_info["server_info"] = server_info_records[0] if server_info_records else {}
|
| 1017 |
+
except Exception:
|
| 1018 |
+
connection_info["server_info"] = {}
|
| 1019 |
+
|
| 1020 |
+
return connection_info
|
| 1021 |
+
|
| 1022 |
+
except Exception as e:
|
| 1023 |
+
logger.error(f"Connection info retrieval failed: {str(e)}")
|
| 1024 |
+
return {
|
| 1025 |
+
"status": "error",
|
| 1026 |
+
"error": str(e),
|
| 1027 |
+
"error_type": type(e).__name__
|
| 1028 |
+
}
|
| 1029 |
+
|
| 1030 |
+
|
| 1031 |
+
if __name__ == "__main__":
|
| 1032 |
+
import sys
|
| 1033 |
+
try:
|
| 1034 |
+
asyncio.run(neo4j_connection.connect())
|
| 1035 |
+
print(f"Looking for graph data at: {GRAPH_DATA_FILE}")
|
| 1036 |
+
print(f"File exists: {os.path.exists(GRAPH_DATA_FILE)}")
|
| 1037 |
+
|
| 1038 |
+
result = asyncio.run(upload_graph_data_impl())
|
| 1039 |
+
print(f"Upload result: {result.get('status', 'unknown')}")
|
| 1040 |
+
|
| 1041 |
+
if result.get('status') == 'error':
|
| 1042 |
+
print(f"❌ Error details: {result.get('error', 'Unknown error')}")
|
| 1043 |
+
if 'error_type' in result:
|
| 1044 |
+
print(f"Error type: {result['error_type']}")
|
| 1045 |
+
|
| 1046 |
+
except ValueError as e:
|
| 1047 |
+
logger.error(f"Configuration Error: {e}")
|
| 1048 |
+
sys.exit(1)
|
| 1049 |
+
except Exception as e:
|
| 1050 |
+
logger.warning(f"Connection Warning: {e}")
|
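
Note on verifying the upload above: upload_graph_data_impl() reports per-component success rates, but it can be useful to confirm the counts directly in Neo4j. The following is a minimal sketch, not part of the repo, assuming the standard neo4j Python driver and the node labels created above; the URI and credentials are placeholders.

# Post-upload sanity check (sketch; URI and credentials are placeholders)
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
with driver.session() as session:
    # Count the node labels written by the uploader
    for label in ["Community", "Embeddings", "SearchOptimizationObject", "EmbeddingsObject"]:
        count = session.run(f"MATCH (n:{label}) RETURN count(n) AS c").single()["c"]
        print(f"{label}: {count}")
    # Count the community membership edges created at the end of the upload
    rels = session.run("MATCH ()-[r:BELONGS_TO_COMMUNITY]->() RETURN count(r) AS c").single()["c"]
    print(f"BELONGS_TO_COMMUNITY: {rels}")
driver.close()
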
4_query copy.py
ADDED
@@ -0,0 +1,173 @@
import os
from my_config import MY_CONFIG

# If connection to https://huggingface.co/ fails, MY_CONFIG.HF_ENDPOINT can point to a mirror
os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore
from dotenv import load_dotenv
from llama_index.llms.litellm import LiteLLM
import query_utils
import time
import logging
import json

# Create the log directory before attaching the FileHandler, otherwise
# logging.FileHandler raises FileNotFoundError on a fresh checkout
os.makedirs('logs/query', exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/query/query_log.txt', mode='a'),  # Save to file
        logging.StreamHandler()  # Also show in console
    ],
    force=True
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def run_query(query: str):
    global query_engine
    logger.info("-----------------------------------")
    start_time = time.time()
    query = query_utils.tweak_query(query, MY_CONFIG.LLM_MODEL)
    logger.info(f"\nProcessing Query:\n{query}")

    # Get initial vector response
    vector_response = query_engine.query(query)
    vector_text = str(vector_response).strip()

    # Structured prompt
    structured_prompt = f"""Please provide a comprehensive, well-structured answer using the provided document information.

Question: {query}

Document Information:
{vector_text}

Instructions:
1. Provide accurate, factual information based on the documents
2. Structure your response clearly with proper formatting
3. Be comprehensive yet concise
4. Highlight key relationships and important details when relevant
5. Use bullet points or sections when appropriate for clarity

Please provide your answer:"""

    # Use structured prompt for final synthesis
    res = query_engine.query(structured_prompt)

    end_time = time.time()
    total_time = end_time - start_time
    logger.info("-------"
                + f"\nResponse:\n{res}"
                + f"\n\n⏱️ Total time: {total_time:.1f} seconds"
                + f"\n\nResponse Metadata:\n{json.dumps(res.metadata, indent=2)}"
                # + f"\nSource Nodes: {[node.node_id for node in res.source_nodes]}"
                )
    logger.info("-----------------------------------")

    # Save response and metadata to files
    _save_query_files(query, res, total_time)

    return res

def _save_query_files(query: str, response, total_time: float):
    """Save query response and metadata to files."""
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')

    try:
        # Save response to file
        with open('logs/query/query_responses.txt', 'a', encoding='utf-8') as f:
            f.write(f"\n{'='*80}\n")
            f.write(f"QUERY [{timestamp}]: {query}\n")
            f.write(f"{'='*80}\n")
            f.write(f"RESPONSE: {response}\n")
            f.write(f"TIME: {total_time:.1f} seconds\n")
            f.write(f"{'='*80}\n\n")

        # Save metadata to file
        with open('logs/query/query_metadata.txt', 'a', encoding='utf-8') as f:
            f.write(f"\n{'='*80}\n")
            f.write(f"METADATA [{timestamp}]: {query}\n")
            f.write(f"{'='*80}\n")
            f.write(f"TIME: {total_time:.1f} seconds\n")
            f.write(json.dumps(response.metadata, indent=2, default=str))
            f.write(f"\n{'='*80}\n\n")

        logger.info(f"Saved response and metadata for query: {query[:50]}...")
    except Exception as e:
        logger.error(f"Failed to save query files: {e}")

## ======= end : run_query =======

## load env config
load_dotenv()

# Setup embeddings
Settings.embed_model = HuggingFaceEmbedding(
    model_name=MY_CONFIG.EMBEDDING_MODEL
)
logger.info(f"✅ Using embedding model: {MY_CONFIG.EMBEDDING_MODEL}")

# Connect to Vector RAG only database
vector_store = MilvusVectorStore(
    uri=MY_CONFIG.MILVUS_URI_VECTOR,  # Use dedicated Vector-only database
    dim=MY_CONFIG.EMBEDDING_LENGTH,
    collection_name=MY_CONFIG.COLLECTION_NAME,
    overwrite=False  # so we load the index from db
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
logger.info(f"✅ Connected to Vector-only Milvus instance: {MY_CONFIG.MILVUS_URI_VECTOR}")

# Load Document Index from DB
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, storage_context=storage_context)
logger.info(f"✅ Loaded Vector-only index from: {MY_CONFIG.MILVUS_URI_VECTOR}")

# Setup LLM
logger.info(f"✅ Using LLM model : {MY_CONFIG.LLM_MODEL}")
Settings.llm = LiteLLM(
    model=MY_CONFIG.LLM_MODEL,
)

query_engine = index.as_query_engine()

# Sample queries
queries = [
    # "What is AI Alliance?",
    # "What are the main focus areas of AI Alliance?",
    # "What are some ai alliance projects?",
    # "What are the upcoming events?",
    # "How do I join the AI Alliance?",
    # "When was the moon landing?",
]

for query in queries:
    run_query(query)

logger.info("-----------------------------------")

while True:
    # Get user input
    user_query = input("\nEnter your question (or 'q' to exit): ")

    # Check if user wants to quit
    if user_query.lower() in ['quit', 'exit', 'q']:
        logger.info("Goodbye!")
        break

    # Skip empty queries
    if user_query.strip() == "":
        continue

    try:
        run_query(user_query)
    except Exception as e:
        logger.error(f"Error processing query: {e}")
        print(f"Error processing query: {e}")
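
Note on inspecting retrieval quality: run_query() above logs the response metadata, and the LlamaIndex Response it returns also carries the retrieved source nodes with their similarity scores. A minimal sketch, not part of the repo, assuming the Response object returned by run_query():

# Print each retrieved source node with its similarity score (sketch)
def show_sources(res, max_chars: int = 120):
    for nws in res.source_nodes:
        score = nws.score if nws.score is not None else float("nan")
        snippet = nws.node.get_content()[:max_chars].replace("\n", " ")
        print(f"score={score:.3f}  id={nws.node.node_id}  text={snippet}...")

res = run_query("What is AI Alliance?")
show_sources(res)
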
4_query.ipynb
ADDED
@@ -0,0 +1,398 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RAG on HTML documents\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-1: Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from my_config import MY_CONFIG"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-2: Setup Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If connection to https://huggingface.co/ failed, uncomment the following path\n",
    "import os\n",
    "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sujee/my-stuff/projects/ai-alliance/allycat-1/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
    "from llama_index.core import Settings\n",
    "\n",
    "Settings.embed_model = HuggingFaceEmbedding(\n",
    "    model_name = MY_CONFIG.EMBEDDING_MODEL\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-3: Connect to Milvus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sujee/my-stuff/projects/ai-alliance/allycat-1/.venv/lib/python3.11/site-packages/milvus_lite/__init__.py:15: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n",
      "  from pkg_resources import DistributionNotFound, get_distribution\n",
      "2025-07-14 00:23:38,214 [DEBUG][_create_connection]: Created new connection using: async-workspace/rag_website_milvus.db (async_milvus_client.py:599)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Connected to Milvus instance: workspace/rag_website_milvus.db\n"
     ]
    }
   ],
   "source": [
    "# connect to vector db\n",
    "from llama_index.core import VectorStoreIndex, StorageContext\n",
    "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
    "\n",
    "vector_store = MilvusVectorStore(\n",
    "    uri = MY_CONFIG.DB_URI,\n",
    "    dim = MY_CONFIG.EMBEDDING_LENGTH,\n",
    "    collection_name = MY_CONFIG.COLLECTION_NAME,\n",
    "    overwrite=False  # so we load the index from db\n",
    ")\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "\n",
    "print(\"✅ Connected to Milvus instance: \", MY_CONFIG.DB_URI)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-4: Load Document Index from DB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Loaded index from vector db: workspace/rag_website_milvus.db\n",
      "CPU times: user 109 ms, sys: 16.8 ms, total: 126 ms\n",
      "Wall time: 123 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "from llama_index.core import VectorStoreIndex\n",
    "\n",
    "index = VectorStoreIndex.from_vector_store(\n",
    "    vector_store=vector_store, storage_context=storage_context)\n",
    "\n",
    "print(\"✅ Loaded index from vector db:\", MY_CONFIG.DB_URI)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-5: Setup LLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Using LLM model : ollama/gemma3:1b\n"
     ]
    }
   ],
   "source": [
    "from llama_index.llms.litellm import LiteLLM\n",
    "\n",
    "# Setup LLM\n",
    "print(f\"✅ Using LLM model : {MY_CONFIG.LLM_MODEL}\")\n",
    "Settings.llm = LiteLLM(\n",
    "    model=MY_CONFIG.LLM_MODEL,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-6: Query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The AI Alliance is an international community of researchers, developers, and organizational leaders committed to fostering open innovation across the AI technology landscape to accelerate progress, improve safety, security, diversity and economic competitiveness in AI.\n"
     ]
    }
   ],
   "source": [
    "import query_utils\n",
    "\n",
    "query_engine = index.as_query_engine()\n",
    "query = query_utils.tweak_query('What is AI Alliance?', MY_CONFIG.LLM_MODEL)\n",
    "res = query_engine.query(query)\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The AI Alliance is focused on fostering an open community and enabling developers and researchers to accelerate responsible innovation in AI while ensuring scientific rigor, trust, safety, security, diversity and economic competitiveness.\n"
     ]
    }
   ],
   "source": [
    "query_engine = index.as_query_engine()\n",
    "query = query_utils.tweak_query('What are the main focus areas of AI Alliance?', MY_CONFIG.LLM_MODEL)\n",
    "res = query_engine.query(query)\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Based on the provided text, here are some of the AI Alliance projects mentioned:\n",
      "\n",
      "* FPT Software\n",
      "* Hebrew University of Jerusalem\n",
      "* Hugging Face\n",
      "* IBM\n",
      "* Abdus Salam International Centre for Theoretical Physics (ICTP)\n",
      "* Imperial College London\n",
      "* Indian Institute of Technology Bombay\n",
      "* Institute for Computer Science, Artificial Intelligence\n",
      "* Intel\n",
      "* Keio University\n",
      "* LangChain\n",
      "* LlamaIndex\n",
      "* Linux Foundation\n",
      "* Mass Open Cloud Alliance, operated by Boston University and Harvard\n",
      "* Meta\n",
      "* Mohamed bin Zayed University of Artificial Intelligence\n",
      "* MLCommons\n",
      "* National Aeronautics and Space Administration\n",
      "* National Science Foundation\n",
      "* New York University\n",
      "* NumFOCUS\n",
      "* OpenTeams\n",
      "* Oracle\n",
      "* Partnership on AI\n",
      "* Quansight\n",
      "* Red Hat\n",
      "* Rensselaer Polytechnic Institute\n",
      "* Roadzen\n",
      "* Sakana AI\n",
      "* SB Intuitions\n",
      "* ServiceNow\n",
      "* Silo AI\n",
      "* Simons Foundation\n",
      "* Sony Group\n",
      "* Stability AI\n",
      "* Together AI\n",
      "* TU Munich\n",
      "* UC Berkeley College of Computing, Data Science, and Society\n",
      "* University of Illinois Urbana-Champaign\n",
      "* The University of Notre Dame\n",
      "* The University of Texas at Austin\n",
      "* The University of Tokyo\n"
     ]
    }
   ],
   "source": [
    "query_engine = index.as_query_engine()\n",
    "query = query_utils.tweak_query('What are some ai alliance projects?', MY_CONFIG.LLM_MODEL)\n",
    "res = query_engine.query(query)\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "On August 8th, The AI Alliance hosted Open Source AI Demo Night in San Francisco.\n"
     ]
    }
   ],
   "source": [
    "query_engine = index.as_query_engine()\n",
    "query = query_utils.tweak_query('Where was the demo night held?', MY_CONFIG.LLM_MODEL)\n",
    "res = query_engine.query(query)\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The AI Alliance is focused on developing and sharing foundational models for science.\n"
     ]
    }
   ],
   "source": [
    "query_engine = index.as_query_engine()\n",
    "query = query_utils.tweak_query('What is the AI Alliance doing in the area of material science?', MY_CONFIG.LLM_MODEL)\n",
    "res = query_engine.query(query)\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "By submitting this form, you agree that the AI Alliance will collect and process the personal information you provide to keep you informed about AI Alliance initiatives and enable your involvement in AI Alliance activities. Additionally, you agree that the AI Alliance may share the personal information you provide with its member organizations so that they may communicate with you about AI Alliance initiatives and your involvement in AI Alliance activities.\n",
      "\n",
      "You may withdraw your consent for the processing of your personal information by the AI Alliance. Please contact us to request a permanent deletion.\n"
     ]
    }
   ],
   "source": [
    "query_engine = index.as_query_engine()\n",
    "query = query_utils.tweak_query('How do I join the AI Alliance?', MY_CONFIG.LLM_MODEL)\n",
    "res = query_engine.query(query)\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The context does not provide information about the moon landing.\n"
     ]
    }
   ],
   "source": [
    "query_engine = index.as_query_engine()\n",
    "query = query_utils.tweak_query('When was the moon landing?', MY_CONFIG.LLM_MODEL)\n",
    "res = query_engine.query(query)\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "allycat-1",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
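
Note on the local vector database: the notebook connects to the Milvus Lite file shown in its output. A minimal sketch, not part of the repo, for checking what that file contains, assuming pymilvus's MilvusClient and the same .db path:

# List collections and row counts in the Milvus Lite file (sketch)
from pymilvus import MilvusClient

client = MilvusClient("workspace/rag_website_milvus.db")
for name in client.list_collections():
    stats = client.get_collection_stats(name)
    print(f"collection={name}  rows={stats['row_count']}")
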
4_query.py
ADDED
@@ -0,0 +1,194 @@
"""
Vector RAG Query
"""

import os
from my_config import MY_CONFIG

# If connection to https://huggingface.co/ fails, MY_CONFIG.HF_ENDPOINT can point to a mirror
os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore
from dotenv import load_dotenv
from llama_index.llms.litellm import LiteLLM
import query_utils
import time
import logging
import json

# Create logs directory if it doesn't exist
os.makedirs('logs/query', exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/query/query_log.txt', mode='a'),  # Save to file
        logging.StreamHandler()  # Also show in console
    ],
    force=True
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def run_query(query: str):
    global query_engine
    logger.info("-----------------------------------")
    start_time = time.time()
    query = query_utils.tweak_query(query, MY_CONFIG.LLM_MODEL)
    logger.info(f"\nProcessing Query:\n{query}")

    # Get initial vector response
    vector_response = query_engine.query(query)
    vector_text = str(vector_response).strip()

    # Structured prompt
    structured_prompt = f"""Please provide a comprehensive, well-structured answer using the provided document information.

Question: {query}

Document Information:
{vector_text}

Instructions:
1. Provide accurate, factual information based on the documents
2. Structure your response clearly with proper formatting
3. Be comprehensive yet concise
4. Highlight key relationships and important details when relevant
5. Use bullet points or sections when appropriate for clarity

Please provide your answer:"""

    # Use structured prompt for final synthesis
    res = query_engine.query(structured_prompt)

    end_time = time.time()
    total_time = end_time - start_time
    logger.info("-------"
                + f"\nResponse:\n{res}"
                + f"\n\n⏱️ Total time: {total_time:.1f} seconds"
                + f"\n\nResponse Metadata:\n{json.dumps(res.metadata, indent=2)}"
                + f"\nSource Nodes: {[node.node_id for node in res.source_nodes]}"
                )
    logger.info("-----------------------------------")

    # Save response and metadata to files
    _save_query_files(query, res, total_time)

    return res

def _save_query_files(query: str, response, total_time: float):
    """Save query response and metadata to files."""
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')

    try:
        # Save response to file
        with open('logs/query/query_responses.txt', 'a', encoding='utf-8') as f:
            f.write(f"\n{'='*80}\n")
            f.write(f"QUERY [{timestamp}]: {query}\n")
            f.write(f"{'='*80}\n")
            f.write(f"RESPONSE: {response}\n")
            f.write(f"TIME: {total_time:.1f} seconds\n")
            f.write(f"{'='*80}\n\n")

        # Save metadata to file
        with open('logs/query/query_metadata.txt', 'a', encoding='utf-8') as f:
            f.write(f"\n{'='*80}\n")
            f.write(f"METADATA [{timestamp}]: {query}\n")
            f.write(f"{'='*80}\n")
            f.write(f"TIME: {total_time:.1f} seconds\n")
            f.write(json.dumps(response.metadata, indent=2, default=str))
            f.write(f"\n{'='*80}\n\n")

        logger.info(f"Saved response and metadata for query: {query[:50]}...")
    except Exception as e:
        logger.error(f"Failed to save query files: {e}")

## ======= end : run_query =======

## load env config
load_dotenv()

# Setup embeddings
Settings.embed_model = HuggingFaceEmbedding(
    model_name=MY_CONFIG.EMBEDDING_MODEL
)
logger.info(f"✅ Using embedding model: {MY_CONFIG.EMBEDDING_MODEL}")

# Connect to vector database based on configuration
if MY_CONFIG.VECTOR_DB_TYPE == "cloud_zilliz":
    # Use Zilliz Cloud
    if not MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT or not MY_CONFIG.ZILLIZ_TOKEN:
        raise ValueError("Cloud database configuration missing. Set ZILLIZ_CLUSTER_ENDPOINT and ZILLIZ_TOKEN in .env")

    vector_store = MilvusVectorStore(
        uri=MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT,
        token=MY_CONFIG.ZILLIZ_TOKEN,
        dim=MY_CONFIG.EMBEDDING_LENGTH,
        collection_name=MY_CONFIG.COLLECTION_NAME,
        overwrite=False
    )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    logger.info("Connected to cloud vector database")
else:
    # Use local Milvus (default)
    vector_store = MilvusVectorStore(
        uri=MY_CONFIG.MILVUS_URI_VECTOR,
        dim=MY_CONFIG.EMBEDDING_LENGTH,
        collection_name=MY_CONFIG.COLLECTION_NAME,
        overwrite=False
    )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    logger.info("Connected to local vector database")

# Load Document Index from database
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, storage_context=storage_context)
logger.info("Vector index loaded successfully")

# Setup LLM
logger.info(f"✅ Using LLM model : {MY_CONFIG.LLM_MODEL}")
Settings.llm = LiteLLM(
    model=MY_CONFIG.LLM_MODEL,
)

query_engine = index.as_query_engine()

# Sample queries
queries = [
    # "What is AI Alliance?",
    # "What are the main focus areas of AI Alliance?",
    # "What are some ai alliance projects?",
    # "What are the upcoming events?",
    # "How do I join the AI Alliance?",
    # "When was the moon landing?",
]

for query in queries:
    run_query(query)

logger.info("-----------------------------------")

while True:
    # Get user input
    user_query = input("\nEnter your question (or 'q' to exit): ")

    # Check if user wants to quit
    if user_query.lower() in ['quit', 'exit', 'q']:
        logger.info("Goodbye!")
        break

    # Skip empty queries
    if user_query.strip() == "":
        continue

    try:
        run_query(user_query)
    except Exception as e:
        logger.error(f"Error processing query: {e}")
        print(f"Error processing query: {e}")
4b_query_graph copy.py
ADDED
@@ -0,0 +1,338 @@
"""
GraphRAG Implementation - Main Query Engine

Imports Step 1 functionality from query_graph_functions/setup.py
and implements the complete 25-step DRIFT search methodology.
"""

import time
import logging
import json
import importlib
import sys
import os
import asyncio
from typing import Dict, Any

# Apply nest_asyncio to allow nested event loops
import nest_asyncio
nest_asyncio.apply()

# Import Step 1 functionality from setup module
from query_graph_functions.setup import create_graphrag_setup
# Import Steps 3-5 functionality from query preprocessing module
from query_graph_functions.query_preprocessing import (
    create_query_preprocessor,
    preprocess_query_pipeline
)
# Import Steps 6-8 functionality from knowledge retrieval module
from query_graph_functions.knowledge_retrieval import CommunitySearchEngine
# Import Steps 9-12 functionality from follow-up search module
from query_graph_functions.follow_up_search import FollowUpSearch
# Import Steps 13-14 functionality from vector augmentation module
from query_graph_functions.vector_augmentation import VectorAugmentationEngine
# Import Steps 15-16 functionality from answer synthesis module
from query_graph_functions.answer_synthesis import AnswerSynthesisEngine
# Import Steps 17-20 functionality from response management module
from query_graph_functions.response_management import ResponseManager
from my_config import MY_CONFIG
import query_utils

# Create the log directory before attaching the FileHandler, otherwise a
# fresh checkout fails with FileNotFoundError
os.makedirs('logs/graphrag_query', exist_ok=True)

# Configure logging - Save to file and console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/graphrag_query/graphrag_query_log.txt', mode='a'),  # Save to file
        logging.StreamHandler()  # Also show in console
    ],
    force=True
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Log session start
logger.info("=" * 80)
logger.info(f"GraphRAG Session Started - {time.strftime('%Y-%m-%d %H:%M:%S')}")
logger.info("=" * 80)


class GraphQueryEngine:
    """
    GraphRAG Query Engine - Complete Implementation

    Uses setup module for Step 1 initialization and query preprocessing
    module for Steps 3-5, implementing the full 25-step DRIFT search methodology.
    """

    def __init__(self):
        logger.info("GraphRAG Query Engine Initializing")

        # Initialize using setup module (Step 1)
        self.setup = create_graphrag_setup()

        # Extract components from setup
        self.neo4j_conn = self.setup.neo4j_conn
        self.query_engine = self.setup.query_engine
        self.graph_stats = self.setup.graph_stats
        self.drift_config = self.setup.drift_config
        self.llm = self.setup.llm
        self.config = self.setup.config

        # Initialize query preprocessor (Steps 3-5) - will be created async
        self.query_preprocessor = None

        # Initialize response manager (Steps 17-20)
        self.response_manager = ResponseManager(self.setup)

        logger.info("GraphRAG Query Engine Ready")

    async def run_query_async(self, user_query: str) -> Dict[str, Any]:
        """
        GraphRAG Query Pipeline - Main Entry Point (Async)

        Implements Phase B (Steps 3-5) of the 25-step DRIFT search methodology
        """
        logger.info("=" * 60)
        logger.info("GraphRAG Query Pipeline Starting")
        logger.info("=" * 60)

        start_time = time.time()

        # Apply query optimization
        optimized_query = query_utils.tweak_query(user_query, MY_CONFIG.LLM_MODEL)
        logger.info(f"Original Query: {user_query}")
        if optimized_query != user_query:
            logger.info(f"Optimized Query: {optimized_query}")

        try:
            # Validate system readiness using setup module
            if not self.setup.validate_system_readiness():
                # Route the failure through the synthesis engine's error path
                synthesis_engine = AnswerSynthesisEngine(self.setup)
                return await synthesis_engine.generate_error_response("System not properly initialized")

            # PHASE B: QUERY PREPROCESSING (Steps 3-5)
            logger.info("Phase B: Starting Query Preprocessing (Steps 3-5)")

            # Initialize query preprocessor if needed
            if not self.query_preprocessor:
                self.query_preprocessor = await create_query_preprocessor(
                    self.config, self.graph_stats
                )

            # Execute complete preprocessing pipeline
            analysis, routing, vectorization = await preprocess_query_pipeline(
                optimized_query, self.config, self.graph_stats
            )

            logger.info(f"Phase B Completed: "
                        f"Type={analysis.query_type.value}, "
                        f"Strategy={routing.search_strategy.value}")

            # PHASE C: COMMUNITY RETRIEVAL (Steps 6-7)
            logger.info("Phase C: Starting Community Retrieval (Steps 6-7)")

            # Create community search engine
            community_engine = CommunitySearchEngine(self.setup)

            # Execute the primer phase (Steps 6-8)
            community_results = await community_engine.execute_primer_phase(
                vectorization.embedding, routing
            )

            # Extract communities for Phase D
            communities = community_results['communities']

            logger.info(f"Phase C Completed: Retrieved {len(communities)} communities")

            # PHASE D: FOLLOW-UP SEARCH (Steps 9-12)
            logger.info("Phase D: Starting Follow-up Search (Steps 9-12)")

            # Create follow-up search engine
            follow_up_engine = FollowUpSearch(self.setup)

            # Execute follow-up search phase
            follow_up_results = await follow_up_engine.execute_follow_up_phase(
                community_results, routing
            )

            logger.info(f"Phase D Completed: Generated {len(follow_up_results.get('intermediate_answers', []))} detailed answers")

            # PHASE E: VECTOR SEARCH AUGMENTATION (Steps 13-14)
            logger.info("Phase E: Starting Vector Search Augmentation (Steps 13-14)")

            # Create vector augmentation engine
            vector_engine = VectorAugmentationEngine(self.setup)

            # Execute vector augmentation phase
            augmentation_results = await vector_engine.execute_vector_augmentation_phase(
                vectorization.embedding,
                {'communities': communities, 'initial_answer': community_results['initial_answer'], 'follow_up_results': follow_up_results},
                routing
            )

            logger.info(f"Phase E Completed: Vector augmentation confidence: {augmentation_results.augmentation_confidence:.3f}")

            # PHASE F: ANSWER SYNTHESIS (Steps 15-16)
            logger.info("Phase F: Starting Answer Synthesis (Steps 15-16)")

            # Create answer synthesis engine
            synthesis_engine = AnswerSynthesisEngine(self.setup)

            # Execute comprehensive answer synthesis
            synthesis_results = await synthesis_engine.execute_answer_synthesis_phase(
                analysis, routing, community_results, follow_up_results, augmentation_results
            )

            logger.info(f"Phase F Completed: Final synthesis confidence: {synthesis_results.confidence_score:.3f}")

            # PHASE G: RESPONSE MANAGEMENT (Steps 17-20)
            logger.info("Phase G: Starting Response Management (Steps 17-20)")

            # Enhanced implementation using preprocessing results
            if self.query_engine:
                # Use the vectorized query for better results
                _ = self.query_engine.query(vectorization.normalized_query)
                total_time = time.time() - start_time

                logger.info(f"Enhanced Query Completed in {total_time:.2f}s")
                logger.info("=" * 60)

                # Use Phase F synthesis result as the final answer
                enhanced_answer = synthesis_results.final_answer

                # Generate comprehensive metadata using ResponseManager
                metadata = self.response_manager.generate_metadata(
                    analysis=analysis,
                    routing=routing,
                    vectorization=vectorization,
                    community_results=community_results,
                    follow_up_results=follow_up_results,
                    augmentation_results=augmentation_results,
                    synthesis_results=synthesis_results,
                    total_time=total_time,
                    graph_stats=self.graph_stats,
                    config=self.config
                )

                result = {
                    "answer": enhanced_answer,
                    "metadata": metadata
                }

                # Save response and metadata to files using ResponseManager
                self.response_manager.save_response_to_files(user_query, result)

                logger.info("Phase G Completed: Response management finished")

                return result
            else:
                return await synthesis_engine.generate_error_response("Query engine not available")

        except Exception as e:
            logger.error(f"Query Pipeline Failed: {e}")
            # Build the error response with a locally constructed synthesis engine:
            # the one created in the try block may not exist if the failure
            # happened before Phase F
            synthesis_engine = AnswerSynthesisEngine(self.setup)
            return await synthesis_engine.generate_error_response(f"Query processing error: {e}")

    def run_query(self, user_query: str) -> Dict[str, Any]:
        """
        Synchronous wrapper for async query processing.

        This maintains backward compatibility while using the new async pipeline.
        Uses nest_asyncio and our LiteLLM patch to properly handle async tasks.
        """
        try:
            # Use the current event loop since nest_asyncio.apply() has been called
            loop = asyncio.get_event_loop()

            # Create a future to gather all tasks and wait for completion
            async def run_with_cleanup():
                try:
                    # Run the main query
                    result = await self.run_query_async(user_query)

                    # Use setup module's cleanup function
                    await self.setup.cleanup_async_tasks(timeout=2.0)

                    return result
                except Exception as e:
                    logger.error(f"Async Query Execution Failed: {e}")
                    raise e

            # Run the async function with cleanup
            return loop.run_until_complete(run_with_cleanup())

        except Exception as e:
            logger.error(f"Sync Query Wrapper Failed: {e}")
            # Use synthesis engine for error handling
            synthesis_engine = AnswerSynthesisEngine(self.setup)
            loop = asyncio.get_event_loop()
            return loop.run_until_complete(
                synthesis_engine.generate_error_response(f"Query processing error: {e}")
            )

    def close(self):
        """Clean up connections using setup module"""
        if self.setup:
            self.setup.close()
            logger.info("GraphQueryEngine cleanup complete")


if __name__ == "__main__":
    print("GraphRAG Implementation - Hot Reload Enabled")
    print("=" * 50)
    print("Step 1: Initialization and Connection")
    print("Hot Reload: Type 'r' to reload modules")
    print("=" * 50)

    engine = GraphQueryEngine()

    try:
        # Create an event loop for the main thread
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        while True:
user_query = input("\nEnter your question ('q' to exit, 'r' to reload): ")
|
| 295 |
+
|
| 296 |
+
if user_query.lower() in ['quit', 'exit', 'q']:
|
| 297 |
+
print("Goodbye!")
|
| 298 |
+
break
|
| 299 |
+
|
| 300 |
+
if user_query.lower() == 'r':
|
| 301 |
+
print("Reloading...")
|
| 302 |
+
engine.close()
|
| 303 |
+
|
| 304 |
+
# Run cleanup tasks before reloading using setup module
|
| 305 |
+
loop.run_until_complete(
|
| 306 |
+
engine.setup.cleanup_async_tasks(timeout=3.0) if engine.setup else None
|
| 307 |
+
)
|
| 308 |
+
|
| 309 |
+
engine = GraphQueryEngine()
|
| 310 |
+
print("Reloaded!")
|
| 311 |
+
continue
|
| 312 |
+
|
| 313 |
+
if user_query.strip() == "":
|
| 314 |
+
continue
|
| 315 |
+
|
| 316 |
+
# Direct method call - clean forward-only implementation
|
| 317 |
+
result = engine.run_query(user_query)
|
| 318 |
+
|
| 319 |
+
# Print results
|
| 320 |
+
print("\n" + "=" * 60)
|
| 321 |
+
print("GraphRAG Query Results")
|
| 322 |
+
print("=" * 60)
|
| 323 |
+
print(f"Answer: {result['answer']}")
|
| 324 |
+
print(f"\nMetadata: {json.dumps(result['metadata'], indent=2)}")
|
| 325 |
+
print("=" * 60)
|
| 326 |
+
|
| 327 |
+
except Exception as e:
|
| 328 |
+
logger.error(f"Error processing query: {e}")
|
| 329 |
+
print(f"Error processing query: {e}")
|
| 330 |
+
finally:
|
| 331 |
+
# Run final cleanup before exiting using setup module
|
| 332 |
+
if 'loop' in locals() and 'engine' in locals():
|
| 333 |
+
loop.run_until_complete(
|
| 334 |
+
engine.setup.cleanup_async_tasks(timeout=5.0) if engine.setup else None
|
| 335 |
+
)
|
| 336 |
+
loop.close()
|
| 337 |
+
if 'engine' in locals():
|
| 338 |
+
engine.close()
|
4b_query_graph.py
ADDED
@@ -0,0 +1,327 @@
"""
GraphRAG Query
"""

import time
import logging
import json
import importlib
import sys
import os
import asyncio
from typing import Dict, Any

# Apply nest_asyncio to allow nested event loops
import nest_asyncio
nest_asyncio.apply()

# Import Step 1 functionality from setup module
from query_graph_functions.setup import create_graphrag_setup
# Import Steps 3-5 functionality from query preprocessing module
from query_graph_functions.query_preprocessing import (
    create_query_preprocessor,
    preprocess_query_pipeline
)
# Import Steps 6-8 functionality from knowledge retrieval module
from query_graph_functions.knowledge_retrieval import CommunitySearchEngine
# Import Steps 9-12 functionality from follow-up search module
from query_graph_functions.follow_up_search import FollowUpSearch
# Import Steps 13-14 functionality from vector augmentation module
from query_graph_functions.vector_augmentation import VectorAugmentationEngine
# Import Steps 15-16 functionality from answer synthesis module
from query_graph_functions.answer_synthesis import AnswerSynthesisEngine
# Import Steps 17-20 functionality from response management module
from query_graph_functions.response_management import ResponseManager
from my_config import MY_CONFIG
import query_utils

# Create logs directory if it doesn't exist
os.makedirs('logs/graphrag_query', exist_ok=True)

# Configure logging - Save to file and console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/graphrag_query/graphrag_query_log.txt', mode='a'),  # Save to file
        logging.StreamHandler()  # Also show in console
    ],
    force=True
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Log session start
logger.info("=" * 80)
logger.info(f"GraphRAG Session Started - {time.strftime('%Y-%m-%d %H:%M:%S')}")
logger.info("=" * 80)


class GraphQueryEngine:
    def __init__(self):
        logger.info("GraphRAG Query Engine Initializing")

        # Initialize using setup module (Step 1)
        self.setup = create_graphrag_setup()

        # Extract components from setup
        self.neo4j_conn = self.setup.neo4j_conn
        self.query_engine = self.setup.query_engine
        self.graph_stats = self.setup.graph_stats
        self.drift_config = self.setup.drift_config
        self.llm = self.setup.llm
        self.config = self.setup.config

        # Initialize query preprocessor (Steps 3-5) - will be created async
        self.query_preprocessor = None

        # Initialize response manager (Steps 17-20)
        self.response_manager = ResponseManager(self.setup)

        logger.info("GraphRAG Query Engine Ready")

    async def run_query_async(self, user_query: str) -> Dict[str, Any]:
        """
        GraphRAG Query Pipeline - Main Entry Point (Async)

        Implements Phase B (Steps 3-5) of the 25-step DRIFT search methodology
        """
        logger.info("=" * 60)
        logger.info("GraphRAG Query Pipeline Starting")
        logger.info("=" * 60)

        start_time = time.time()

        # Apply query optimization
        optimized_query = query_utils.tweak_query(user_query, MY_CONFIG.LLM_MODEL)
        logger.info(f"Original Query: {user_query}")
        if optimized_query != user_query:
            logger.info(f"Optimized Query: {optimized_query}")

        try:
            # Validate system readiness using setup module
            if not self.setup.validate_system_readiness():
                return self._generate_error_response("System not properly initialized")

            # PHASE B: QUERY PREPROCESSING (Steps 3-5)
            logger.info("Phase B: Starting Query Preprocessing (Steps 3-5)")

            # Initialize query preprocessor if needed
            if not self.query_preprocessor:
                self.query_preprocessor = await create_query_preprocessor(
                    self.config, self.graph_stats
                )

            # Execute complete preprocessing pipeline
            analysis, routing, vectorization = await preprocess_query_pipeline(
                optimized_query, self.config, self.graph_stats
            )

            logger.info(f"Phase B Completed: "
                        f"Type={analysis.query_type.value}, "
                        f"Strategy={routing.search_strategy.value}")

            # PHASE C: COMMUNITY RETRIEVAL (Steps 6-7)
            logger.info("Phase C: Starting Community Retrieval (Steps 6-7)")

            # Create community search engine
            community_engine = CommunitySearchEngine(self.setup)

            # Execute the primer phase (Steps 6-8)
            community_results = await community_engine.execute_primer_phase(
                vectorization.embedding, routing
            )

            # Extract communities for Phase D
            communities = community_results['communities']

            logger.info(f"Phase C Completed: Retrieved {len(communities)} communities")

            # PHASE D: FOLLOW-UP SEARCH (Steps 9-12)
            logger.info("Phase D: Starting Follow-up Search (Steps 9-12)")

            # Create follow-up search engine
            follow_up_engine = FollowUpSearch(self.setup)

            # Execute follow-up search phase
            follow_up_results = await follow_up_engine.execute_follow_up_phase(
                community_results, routing
            )

            logger.info(f"Phase D Completed: Generated {len(follow_up_results.get('intermediate_answers', []))} detailed answers")

            # PHASE E: VECTOR SEARCH AUGMENTATION (Steps 13-14)
            logger.info("Phase E: Starting Vector Search Augmentation (Steps 13-14)")

            # Create vector augmentation engine
            vector_engine = VectorAugmentationEngine(self.setup)

            # Execute vector augmentation phase
            augmentation_results = await vector_engine.execute_vector_augmentation_phase(
                vectorization.embedding,
                {'communities': communities, 'initial_answer': community_results['initial_answer'], 'follow_up_results': follow_up_results},
                routing
            )

            logger.info(f"Phase E Completed: Vector augmentation confidence: {augmentation_results.augmentation_confidence:.3f}")

            # PHASE F: ANSWER SYNTHESIS (Steps 15-16)
            logger.info("Phase F: Starting Answer Synthesis (Steps 15-16)")

            # Create answer synthesis engine
            synthesis_engine = AnswerSynthesisEngine(self.setup)

            # Execute comprehensive answer synthesis
            synthesis_results = await synthesis_engine.execute_answer_synthesis_phase(
                analysis, routing, community_results, follow_up_results, augmentation_results
            )

            logger.info(f"Phase F Completed: Final synthesis confidence: {synthesis_results.confidence_score:.3f}")

            # PHASE G: RESPONSE MANAGEMENT (Steps 17-20)
            logger.info("Phase G: Starting Response Management (Steps 17-20)")

            # Enhanced implementation using preprocessing results
            if self.query_engine:
                # Use the vectorized query for better results
                _ = self.query_engine.query(vectorization.normalized_query)
                total_time = time.time() - start_time

                logger.info(f"Enhanced Query Completed in {total_time:.2f}s")
                logger.info("=" * 60)

                # Use Phase F synthesis result as the final answer
                enhanced_answer = synthesis_results.final_answer

                # Generate comprehensive metadata using ResponseManager
                metadata = self.response_manager.generate_comprehensive_metadata(
                    analysis=analysis,
                    routing=routing,
                    vectorization=vectorization,
                    community_results=community_results,
                    follow_up_results=follow_up_results,
                    augmentation_results=augmentation_results,
                    synthesis_results=synthesis_results,
                    total_time=total_time
                )

                result = {
                    "answer": enhanced_answer,
                    "metadata": metadata
                }

                # Save response and metadata to files using ResponseManager
                self.response_manager.save_response_to_files(user_query, result)

                logger.info("Phase G Completed: Response management finished")

                return result
            else:
                return synthesis_engine.generate_error_response("Query engine not available")

        except Exception as e:
            logger.error(f"Query Pipeline Failed: {e}")
            synthesis_engine = AnswerSynthesisEngine(self.setup)
            return synthesis_engine.generate_error_response(f"Query processing error: {e}")

    def run_query(self, user_query: str) -> Dict[str, Any]:
        """
        Synchronous wrapper for async query processing.

        This maintains backward compatibility while using the new async pipeline.
        Uses nest_asyncio and our LiteLLM patch to properly handle async tasks.
        """
        try:
            # Use the current event loop since nest_asyncio.apply() has been called
            loop = asyncio.get_event_loop()

            # Create a future to gather all tasks and wait for completion
            async def run_with_cleanup():
                try:
                    # Run the main query
                    result = await self.run_query_async(user_query)

                    # Use setup module's cleanup function
                    await self.setup.cleanup_async_tasks(timeout=2.0)

                    return result
                except Exception as e:
                    logger.error(f"Async Query Execution Failed: {e}")
                    raise e

            # Run the async function with cleanup
            return loop.run_until_complete(run_with_cleanup())

        except Exception as e:
            logger.error(f"Sync Query Wrapper Failed: {e}")
            # Use synthesis engine for error handling
            synthesis_engine = AnswerSynthesisEngine(self.setup)
            return synthesis_engine.generate_error_response(f"Query processing error: {e}")

    def close(self):
        """Clean up connections using setup module"""
        if self.setup:
            self.setup.close()
        logger.info("GraphQueryEngine cleanup complete")


if __name__ == "__main__":
    print("GraphRAG Implementation - Hot Reload Enabled")
    print("=" * 50)
    print("Step 1: Initialization and Connection")
    print("Hot Reload: Type 'r' to reload modules")
    print("=" * 50)

    engine = GraphQueryEngine()

    try:
        # Create an event loop for the main thread
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        while True:
            user_query = input("\nEnter your question ('q' to exit, 'r' to reload): ")

            if user_query.lower() in ['quit', 'exit', 'q']:
                print("Goodbye!")
                break

            if user_query.lower() == 'r':
                print("Reloading...")
                engine.close()

                # Run cleanup tasks before reloading using setup module
                loop.run_until_complete(
                    engine.setup.cleanup_async_tasks(timeout=3.0) if engine.setup else None
                )

                engine = GraphQueryEngine()
                print("Reloaded!")
                continue

            if user_query.strip() == "":
                continue

            # Direct method call - clean forward-only implementation
            result = engine.run_query(user_query)

            # Print results
            print("\n" + "=" * 60)
            print("GraphRAG Query Results")
            print("=" * 60)
            print(f"Answer: {result['answer']}")
            print(f"\nMetadata: {json.dumps(result['metadata'], indent=2)}")
            print("=" * 60)

    except Exception as e:
        logger.error(f"Error processing query: {e}")
        print(f"Error processing query: {e}")
    finally:
        # Run final cleanup before exiting using setup module
        if 'loop' in locals() and 'engine' in locals():
            loop.run_until_complete(
                engine.setup.cleanup_async_tasks(timeout=5.0) if engine.setup else None
            )
            loop.close()
        if 'engine' in locals():
            engine.close()
CHANGELOG.md
ADDED
@@ -0,0 +1,208 @@
# AllyCat Changelog

All notable technical changes to AllyCat GraphRAG will be documented in this file.

## [Unreleased]

### Added

#### GraphRAG Implementation
- **GraphRAG Core System**: Implemented Microsoft GraphRAG-inspired architecture
  - Entity extraction and relationship mapping from documents
  - Community detection algorithms for knowledge graph clustering (see the sketch after this list)
  - Multi-phase graph processing pipeline (phase 1: entities/relationships, phase 2: communities, phase 3: summaries)
  - Graph-based query system with hierarchical summarization
  - Neo4j integration for graph database storage (`3b_save_to_graph_db.py`)
  - Graph query functions in `query_graph_functions/` directory
  - Dual RAG modes: Traditional Vector RAG + Advanced GraphRAG
  - **Note**: More improvements planned based on [Microsoft GraphRAG Project](https://github.com/microsoft/graphrag)
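The clustering idea can be pictured with a small, self-contained sketch. This is not AllyCat's actual implementation (that lives in the `2b_process_graph_phase*.py` scripts); it runs networkx's greedy modularity algorithm over a made-up entity graph purely to illustrate how related entities fall into communities that can then be summarized:

```python
# Illustrative only: a toy entity graph, not AllyCat's real pipeline.
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

G = nx.Graph()
G.add_edges_from([
    ("AllyCat", "Milvus"), ("AllyCat", "Neo4j"),
    ("Milvus", "Zilliz Cloud"), ("Neo4j", "GraphRAG"),
])

# Each community is a frozenset of related entities; GraphRAG-style
# pipelines then summarize each cluster so queries can be answered
# hierarchically instead of chunk-by-chunk.
for i, community in enumerate(greedy_modularity_communities(G)):
    print(f"Community {i}: {sorted(community)}")
```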
#### LLM Provider Support
- **Cerebras API Integration**: Added support for Cerebras ultra-fast inference
- **Google Gemini API Integration**: Added support for Google's Gemini models
- **LiteLLM Framework**: Implemented `litellm_patch.py` for unified LLM API interface
  - Supports multiple providers: OpenAI, Replicate, Nebius, Cerebras, Gemini, Anthropic, and more
  - Simplified provider switching via environment variables (see the sketch below)
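As a rough sketch of what provider switching looks like through LiteLLM: the provider prefix on the model string selects the backend, and credentials come from environment variables such as `OPENAI_API_KEY` or `GEMINI_API_KEY`. The model names here are examples, not AllyCat's configured defaults:

```python
# Minimal LiteLLM sketch; model names below are examples only.
import os
from litellm import completion

model = os.environ.get("LLM_MODEL", "gemini/gemini-1.5-flash")
response = completion(
    model=model,
    messages=[{"role": "user", "content": "What does AllyCat do?"}],
)
print(response.choices[0].message.content)
```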
#### Database Solutions
- **Zilliz Cloud Integration**: Added cloud-based vector database support
  - Implemented `3_save_to_vector_db_zilliz.py` for Zilliz Cloud
  - Cloud vector database eliminates need for local Milvus server
  - Configurable via `VECTOR_DB_TYPE` environment variable (see the sketch below)
- **Neo4j Graph Database**: Integrated for GraphRAG knowledge graph storage
  - Stores entities, relationships, and community structures
  - Enables complex graph traversal queries
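A minimal connection sketch with `pymilvus`: the same `MilvusClient` API talks to an embedded Milvus Lite file or to Zilliz Cloud, so switching `VECTOR_DB_TYPE` is mostly a matter of which URI/token gets passed. The variable names and file path below are placeholders, not necessarily the project's exact `.env` keys:

```python
# Placeholder env var names and db path; adjust to your .env settings.
import os
from pymilvus import MilvusClient

if os.environ.get("VECTOR_DB_TYPE") == "cloud_zilliz":
    client = MilvusClient(
        uri=os.environ["ZILLIZ_CLOUD_URI"],      # e.g. https://<cluster>.zillizcloud.com
        token=os.environ["ZILLIZ_CLOUD_TOKEN"],  # API key from the Zilliz console
    )
else:
    client = MilvusClient("workspace/allycat.db")  # embedded Milvus Lite file

print(client.list_collections())
```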
#### Docker Deployment System
- **Three Deployment Modes**: Flexible deployment configurations
  - **Cloud Mode** (`docker-compose.cloud.yml`): Cloud LLM + Cloud Zilliz vector DB
  - **Hybrid Mode** (`docker-compose.hybrid.yml`): Cloud LLM + Local Milvus
  - **Local Mode** (`docker-compose.local.yml`): Local Ollama + Local Milvus
- **Automated Deployment Script**: `docker-startup.sh` orchestrates full deployment
  - Conditional service startup based on deployment mode
  - Automatic Ollama model download for local mode
  - Smart service detection and initialization

#### Automatic Pipeline Execution
- **End-to-End Automation**: Single command deployment from crawling to running application
  - Automatic website crawling when `WEBSITE_URL` is set
  - Sequential pipeline execution: crawl → process → vector DB → graph processing → graph DB (see the sketch after this list)
  - Automatic application startup after pipeline completion
  - Controlled via `AUTO_RUN_PIPELINE` environment variable
  - **User Action Required**: Only set environment variables in `.env` file
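The orchestration itself happens in `docker-startup.sh`; a Python rendering of the same sequence, using the script names uploaded in this commit, would look roughly like the sketch below. It is not the shipped implementation, and the real scripts may take additional arguments:

```python
# Sketch only: the shipped orchestration is the docker-startup.sh shell script.
import os
import subprocess

PIPELINE = [
    "1_crawl_site.py",             # crawl WEBSITE_URL
    "2_process_files.py",          # convert downloads to markdown chunks
    "3_save_to_vector_db.py",      # embed and store vectors
    "2b_process_graph_phase1.py",  # entities/relationships
    "2b_process_graph_phase2.py",  # communities
    "2b_process_graph_phase3.py",  # summaries
    "3b_save_to_graph_db.py",      # load graph into Neo4j
]

if os.environ.get("AUTO_RUN_PIPELINE", "false").lower() == "true":
    for step in PIPELINE:
        # Mirror the shell script's forgiving behavior: warn and continue.
        if subprocess.run(["python", step]).returncode != 0:
            print(f"Warning: {step} failed, continuing...")
```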
#### Document Processing Improvements
- **HTML/HTM Processing**: Switched to `html2text` library for better HTML parsing
  - Improved markdown conversion quality
  - Better handling of HTML structure and formatting
  - **Resolves**: [Issue #50](https://github.com/The-AI-Alliance/allycat/issues/50)
- **PDF Processing**: Integrated `docling` library for advanced PDF parsing (see the sketch below)
  - High-quality PDF to markdown conversion
  - Preserves document structure and formatting
  - Handles complex PDF layouts
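Both libraries have small APIs. A sketch of the two conversion paths (file paths and sample HTML are placeholders, and docling's exact result shape may vary across versions):

```python
# Sketch of both conversion paths; inputs are placeholders.
import html2text
from docling.document_converter import DocumentConverter

# HTML/HTM -> markdown via html2text
h = html2text.HTML2Text()
h.ignore_links = False  # keep hyperlinks in the markdown output
markdown_from_html = h.handle("<h1>Hello</h1><p>AllyCat docs</p>")

# PDF -> markdown via docling
converter = DocumentConverter()
result = converter.convert("input/sample.pdf")
markdown_from_pdf = result.document.export_to_markdown()
```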
#### Port Management System
- **Multiple Dynamic Ports**: Flexible port configuration for different services
  - `FLASK_GRAPH_PORT=8080` - Flask GraphRAG application
  - `FLASK_VECTOR_PORT=8081` - Flask Vector RAG application
  - `CHAINLIT_GRAPH_PORT=8083` - Chainlit GraphRAG application
  - `CHAINLIT_VECTOR_PORT=8082` - Chainlit Vector RAG application
  - `DOCKER_PORT` - Host machine port mapping
  - `DOCKER_APP_PORT=8080` - Internal container port
  - `OLLAMA_PORT=11434` - Ollama server port (local mode)
- **Smart Port Routing**: Automatic port selection based on `APP_TYPE` environment variable (see the sketch below)
  - Supports: `flask_graph`, `flask`, `chainlit_graph`, `chainlit`
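In Python terms the routing amounts to a small lookup. This is a hypothetical sketch for illustration; the shipped selection logic lives in `docker-startup.sh`, not in a helper like this:

```python
# Hypothetical sketch of APP_TYPE -> port selection.
import os

PORT_VARS = {
    "flask_graph":    ("FLASK_GRAPH_PORT", 8080),
    "flask":          ("FLASK_VECTOR_PORT", 8081),
    "chainlit":       ("CHAINLIT_VECTOR_PORT", 8082),
    "chainlit_graph": ("CHAINLIT_GRAPH_PORT", 8083),
}

def resolve_port(app_type: str) -> int:
    env_var, default = PORT_VARS[app_type]
    return int(os.environ.get(env_var, default))

print(resolve_port(os.environ.get("APP_TYPE", "flask_graph")))
```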
#### Memory Optimization System
- **CLEANUP_PIPELINE_DEPS Feature**: Post-pipeline dependency cleanup
  - Created `requirements-runtime.txt` (~300 MB) - minimal packages for running Flask GraphRAG app
  - Created `requirements-build.txt` (~500 MB) - pipeline-only packages that can be removed
  - Created `cleanup_pipeline_deps.sh` - automated cleanup script
  - Integrated cleanup into `docker-startup.sh` with conditional execution
  - Added `CLEANUP_PIPELINE_DEPS` configuration to all `.env` sample files
  - Created comprehensive technical documentation in `docs/docker-memory-optimization.md`
  - Updated `docs/docker-deployment-guide.md` with memory optimization section
- **Benefits**: Reduces container RAM from ~800 MB to ~300 MB, enabling 1GB deployments
- **Cost Savings**: DigitalOcean 1GB ($12/mo) vs 2GB ($25/mo) = $156/year savings (52% reduction)

### Changed
- **Chainlit Port Configuration**: Reverted Chainlit apps to use default port behavior
  - Removed custom port configuration code from Python files
  - Chainlit now uses default port 8000 for native Python execution
  - Docker deployments use custom ports via `--port` flag in `docker-startup.sh`
  - Updated documentation in `docs/graphrag-demo/Setup.md`, `docs/configuration.md`, and `my_config.py`
  - Native Python: `chainlit run app_chainlit_graph.py` (port 8000) or with custom `--port 8083`
  - Docker: Custom ports 8082 (vector) and 8083 (graph) configured via environment variables


### Fixed
- **HTML/HTM File Processing**: Fixed HTML parsing issues ([Issue #50](https://github.com/The-AI-Alliance/allycat/issues/50))
  - Switched from previous parser to `html2text` library
  - Improved markdown conversion quality and reliability
- **Pipeline Error Handling**: Confirmed robust error handling with `|| echo "Warning..."` pattern
  - Cleanup script runs even if pipeline steps fail
  - Application starts successfully regardless of pipeline completion status

## [Previous Versions]

### 2025-07-14: Major Update

#### Added
- **Robust Web Crawler** ([#31](https://github.com/The-AI-Alliance/allycat/issues/31))
  - Complete rewrite of web crawler
  - More robust handling of edge cases
  - Support for multiple file types (not just text/html)
  - Correct handling of anchor tags (`a.html#news`) in HTML files
  - Customizable pause between requests to avoid hammering webservers
  - Fixed issue with repeatedly downloading same content

- **LiteLLM Integration** ([#34](https://github.com/The-AI-Alliance/allycat/issues/34))
  - Unified LLM backend support replacing Replicate and Ollama setup
  - Seamless access to local LLMs (using Ollama) and cloud inference providers
  - Support for providers: Nebius, Replicate, and more
  - Significantly simplified LLM configuration
  - Added `python-dotenv` for environment variable management

- **Expanded File Type Support** ([#37](https://github.com/The-AI-Alliance/allycat/issues/37))
  - Support for PDF, DOCX, and other popular file types (previously only HTML)
  - Integration with [Docling](https://github.com/docling-project/docling) for file processing
  - Fixed issue with PDF downloads ([#35](https://github.com/The-AI-Alliance/allycat/issues/35))
  - Processing all downloaded file types
  - Updated process_file script

- **UV Package Manager Support** ([#26](https://github.com/The-AI-Alliance/allycat/issues/26))
  - Added [uv](https://docs.astral.sh/uv/) project structure
  - Updated documentation for uv
  - Continued support for `requirements.txt` and other package managers

- **Better Config Management** ([#19](https://github.com/The-AI-Alliance/allycat/issues/19))
  - User configuration via `.env` file
  - Simplified config management
  - Easier experimentation without code changes
  - Documented configuration options
  - Updated env.sample file with settings

- **Metrics Collection**: Added metrics collection scripts and issue templates

#### Changed
- **Chainlit App Updates** ([#38](https://github.com/The-AI-Alliance/allycat/issues/38))
  - Updated Chainlit application
  - Customizable starter prompts
- **Logo Updates** ([#39](https://github.com/The-AI-Alliance/allycat/issues/39))
  - Updated logo to AllyCAT
- **App Naming**: Changed Flask and Chainlit app names for clarity
- Code cleanup improvements
- Documentation updates across the project

### 2025-05: Chainlit Integration

#### Added
- **Chainlit Chat Interface** ([#17](https://github.com/The-AI-Alliance/allycat/issues/17))
  - Introduced Chainlit-based chat interface as alternative to Flask UI
  - Improved chat UI experience

#### Changed
- Updated README with license and issues links

### 2025-04: Dockerization and Local LLM Support

#### Added
- **Docker Support**: Complete dockerization of the application
  - Docker deployment configurations
  - Updated Google Cloud deployment guide
  - Comprehensive Docker documentation (`running-in-docker.md`)
- **Ollama Integration**: Local LLM support with Ollama
  - Local LLM configuration and setup
  - Local Jupyter Lab support
  - Small tweaks to local LLM config
- **Python Scripts**: Added Python script versions of notebooks

#### Changed
- Updated deployment documentation
- Native running documentation (`running-natively.md`)

### 2025-03: Database and LLM Updates

#### Added
- **Weaviate Database**: Added Weaviate vector database support
- **Local LLM Support**: Initial local LLM integration

#### Changed
- **LLM Switch**: Changed from initial LLM to Llama
- Added logo and GitHub link to UI
- README and deploy guide updates

### 2025-02: Initial Release - AllyCAT (formerly AllyChat)

#### Added
- **Initial Vector RAG System**: First version of AllyCAT
  - Basic RAG implementation
  - Vector database for document storage and retrieval
  - Query system for document Q&A
- **Flask Web Interface**: Web-based chat interface
- **Basic Crawling**: Initial website crawling functionality
- **Document Processing**: Basic document processing pipeline


---
Dockerfile
ADDED
@@ -0,0 +1,66 @@
FROM python:3.11-slim

# Build arguments for conditional installation
ARG INSTALL_OLLAMA=false
ARG INSTALL_LOCAL_VECTOR_DB=false

# Set working directory
WORKDIR /allycat

# Set environment variables - Cloud-first defaults
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    LLM_RUN_ENV=cloud \
    VECTOR_DB_TYPE=cloud_zilliz


# Install minimal system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    bash \
    curl \
    git \
    netcat-traditional \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements file - Use cloud-optimized by default
COPY requirements-docker-cloud.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements-docker-cloud.txt

# Conditional: Install Ollama only if requested
RUN if [ "$INSTALL_OLLAMA" = "true" ]; then \
        echo "Installing Ollama for local LLM support..."; \
        curl -fsSL https://ollama.com/install.sh | sh; \
    else \
        echo "Skipping Ollama installation - using cloud LLM mode"; \
    fi

# Conditional: Install local vector DB dependencies
RUN if [ "$INSTALL_LOCAL_VECTOR_DB" = "true" ]; then \
        echo "Installing milvus-lite for local vector database..."; \
        pip install --no-cache-dir milvus-lite==2.4.11; \
    else \
        echo "Skipping local vector DB - using Zilliz Cloud"; \
    fi

# Copy project files
COPY . .
RUN chmod +x ./docker-startup.sh

# Cleanup unnecessary files
RUN rm -rf .env workspace/* __pycache__ *.pyc

# Expose all application ports (EXPOSE doesn't support env variables at build time)
# Port 8080 = FLASK_GRAPH_PORT (default) / DOCKER_APP_PORT (default)
# Port 8081 = FLASK_VECTOR_PORT (default)
# Port 8082 = CHAINLIT_VECTOR_PORT (default)
# Port 8083 = CHAINLIT_GRAPH_PORT (default)
# Port mapping controlled by docker-compose.yml: ${DOCKER_PORT}:${DOCKER_APP_PORT}
EXPOSE 8080 8081 8082 8083
# Port 11434 = OLLAMA_PORT (default) - only used if INSTALL_OLLAMA=true
EXPOSE 11434

ENTRYPOINT ["./docker-startup.sh"]
CMD ["deploy"]
Dockerfile-dev
ADDED
@@ -0,0 +1,65 @@
FROM python:3.11-slim

# Build arguments for conditional installation
ARG INSTALL_OLLAMA=false
ARG INSTALL_LOCAL_VECTOR_DB=false

# Set working directory
WORKDIR /allycat

# Set environment variables - Cloud-first defaults
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    LLM_RUN_ENV=cloud \
    VECTOR_DB_TYPE=cloud_zilliz


# Install minimal system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    bash \
    curl \
    git \
    netcat-traditional \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements file - Use cloud-optimized by default
COPY requirements-docker-cloud.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements-docker-cloud.txt

# Conditional: Install Ollama only if requested
RUN if [ "$INSTALL_OLLAMA" = "true" ]; then \
        echo "Installing Ollama for local LLM support..."; \
        curl -fsSL https://ollama.com/install.sh | sh; \
    else \
        echo "Skipping Ollama installation - using cloud LLM mode"; \
    fi

# Conditional: Install local vector DB dependencies
RUN if [ "$INSTALL_LOCAL_VECTOR_DB" = "true" ]; then \
        echo "Installing milvus-lite for local vector database..."; \
        pip install --no-cache-dir milvus-lite==2.4.11; \
    else \
        echo "Skipping local vector DB - using Zilliz Cloud"; \
    fi

# Copy project files
COPY . .
RUN chmod +x ./docker-startup.sh

# Cleanup unnecessary files
RUN rm -rf .env workspace/* __pycache__ *.pyc

# Expose all application ports (EXPOSE doesn't support env variables at build time)
# Port 8080 = FLASK_GRAPH_PORT (default) / DOCKER_APP_PORT (default)
# Port 8081 = FLASK_VECTOR_PORT (default)
# Port 8082 = CHAINLIT_VECTOR_PORT (default)
# Port 8083 = CHAINLIT_GRAPH_PORT (default)
# Port mapping controlled by docker-compose.yml: ${DOCKER_PORT}:${DOCKER_APP_PORT}
EXPOSE 8080 8081 8082 8083
# Port 11434 = OLLAMA_PORT (default) - only used if INSTALL_OLLAMA=true
EXPOSE 11434

ENTRYPOINT ["./docker-startup.sh"]
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
CHANGED
@@ -1,10 +1,75 @@
<img src="assets/allycat.png" alt="Alley Cat" width="200"/>

[](https://github.com/The-AI-Alliance/allycat/blob/main/LICENSE)
[](https://github.com/The-AI-Alliance/allycat/issues)


# AllyCat

**AllyCat** is a full stack, open source chatbot that uses GenAI LLMs to answer questions about your website with both traditional Vector RAG and advanced GraphRAG capabilities. It is simple by design and will run on your laptop or server.

## Why?

AllyCat is purposefully simple so it can be used by developers to learn how RAG-based GenAI works. Yet it is powerful enough to use with your website. You may also extend it for your own purposes.

⭐ **Found this tool helpful? Give it a star on GitHub to support the project and help others discover it!**

**🗞️ [Allycat news](news.md)** - releases and new features!

## How does it work?
AllyCat uses your choice of LLM and vector database to implement a chatbot written in Python using [RAG](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) architecture.
AllyCat also includes web scraping tools that extract data from your website (or any website).

## 🌟🌟 Features 🌟🌟

1. **Dual RAG Modes:** Traditional Vector RAG and Advanced GraphRAG with entity extraction and community detection
2. **Web Crawling & Scraping:** Text extraction, data/HTML processing, conversion to markdown.
   - **Currently uses:** [Data Prep Kit Connector](https://github.com/data-prep-kit/data-prep-kit/blob/dev/data-connector-lib/doc/overview.md) and [Docling](https://github.com/docling-project/docling)
3. **Processing:** Chunking, vector embedding creation, saving to vector database.
   - **Currently uses:** [Llama Index](https://docs.llamaindex.ai/en/stable/) and [Granite Embedding Model](https://huggingface.co/ibm-granite/granite-embedding-30m-english)
4. **Multiple LLM Support:**
   - **Local:** [Ollama](https://ollama.com/) with [Llama](https://www.llama.com) or [Granite](https://huggingface.co/collections/ibm-granite/granite-33-language-models-67f65d0cca24bcbd1d3a08e3)
   - **Cloud:** OpenAI, Cerebras, Google Gemini, Replicate, Nebius, and more via [LiteLLM](https://docs.litellm.ai/docs)
5. **Multiple Database Support:**
   - **Vector:** [Milvus](https://milvus.io/) (local/embedded) or [Zilliz Cloud](https://zilliz.com/)
   - **Graph:** [Neo4j](https://neo4j.com/) for GraphRAG knowledge graphs
6. **Flexible Deployment:** Docker support with 3 modes (Cloud, Hybrid, Local) + Native Python
7. **Chatbot Interfaces:** Flask web UI and Chainlit chat interface

## ⚡️⚡️Quickstart ⚡️⚡️

There are two ways to run Allycat.

### Option 1: Use the Docker image

A great option for a quick evaluation.
See [running AllyCat using docker](docs/running-in-docker.md)

### Option 2: Run natively (for tweaking, developing)

Choose this option if you want to tweak AllyCat to fit your needs. For example, experimenting with embedding models or LLMs.
See [running AllyCat natively](docs/running-natively.md)

## AllyCat Workflow


See [running allycat](docs/running-allycat.md)

## Customizing AllyCat

See [customizing allycat](docs/customizing-allycat.md)

## Deploying AllyCat

See [deployment guide](docs/deploy.md)

## Developing AllyCat

See [developing allycat](docs/developing-allycat.md)

## Why the name **AllyCat**?

Originally AllianceChat, we shortened it to AllyCat when we learned chat means cat in French. Who doesn't love cats?!

app_chainlit.py
ADDED
@@ -0,0 +1,299 @@

import chainlit as cl
import os
import logging
from dotenv import load_dotenv
import time
import asyncio
import re

# Import llama-index and related libraries
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.llms.litellm import LiteLLM
from my_config import MY_CONFIG
import query_utils

# Global variables for LLM and index
vector_index = None
initialization_complete = False

# Create logs directory if it doesn't exist
os.makedirs('logs/chainlit', exist_ok=True)

logging.basicConfig(level=logging.WARNING,
                    format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s',
                    handlers=[
                        logging.FileHandler('logs/chainlit/chainlit_vector.log', mode='a'),
                        logging.StreamHandler()
                    ],
                    force=True)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
# Load environment variables from .env file
load_dotenv()

def initialize():
    """
    Initialize LLM and Milvus vector database using llama-index.
    This function sets up the necessary components for the chat application.
    """
    global vector_index, initialization_complete

    if initialization_complete:
        return

    logger.info("Initializing LLM and vector database...")

    # raise Exception ("init exception test") # debug

    try:
        ## embedding model
        Settings.embed_model = HuggingFaceEmbedding(
            model_name=MY_CONFIG.EMBEDDING_MODEL
        )
        logger.info(f"✅ Using embedding model: {MY_CONFIG.EMBEDDING_MODEL}")

        # Setup LLM
        logger.info(f"✅ Using LLM model: {MY_CONFIG.LLM_MODEL}")
        Settings.llm = LiteLLM(
            model=MY_CONFIG.LLM_MODEL,
        )

        # Initialize Milvus vector store
        vector_store = MilvusVectorStore(
            uri=MY_CONFIG.MILVUS_URI_VECTOR,
            dim=MY_CONFIG.EMBEDDING_LENGTH,
            collection_name=MY_CONFIG.COLLECTION_NAME,
            overwrite=False  # so we load the index from db
        )
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        logger.info(f"✅ Connected to Milvus instance: {MY_CONFIG.MILVUS_URI_VECTOR}")

        vector_index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store, storage_context=storage_context)
        logger.info(f"✅ Loaded index from vector db: {MY_CONFIG.MILVUS_URI_VECTOR}")

        logger.info("Successfully initialized LLM and vector database")

        initialization_complete = True
    except Exception as e:
        initialization_complete = False
        logger.error(f"Error initializing LLM and vector database: {str(e)}")
        raise
## -------------

def extract_thinking_section(response_text):
    """
    Extract thinking section from LLM response if present.

    Args:
        response_text (str): The full response from the LLM

    Returns:
        tuple: (thinking_content, cleaned_response)
            - thinking_content: Content within <think></think> tags or None if not found
            - cleaned_response: Response with thinking section removed
    """
    thinking_pattern = r'<think>(.*?)</think>'
    match = re.search(thinking_pattern, response_text, re.DOTALL)

    if match:
        thinking_content = match.group(1).strip()
        cleaned_response = re.sub(thinking_pattern, '', response_text, flags=re.DOTALL).strip()
        return thinking_content, cleaned_response
    else:
        return None, response_text

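# Example behavior:
#   extract_thinking_section("<think>plan</think>Answer")  ->  ("plan", "Answer")
#   extract_thinking_section("Answer")                     ->  (None, "Answer")
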
async def get_llm_response(message):
    """
    Process the user message and get a response from the LLM using Vector RAG
    with a structured prompt.
    """
    global vector_index, initialization_complete

    # Check if LLM and index are initialized
    if vector_index is None or not initialization_complete:
        return "System did not initialize. Please try again later.", 0

    start_time = time.time()
    response_text = ''

    try:
        # Step 1: Query preprocessing
        async with cl.Step(name="Query Preprocessing", type="tool") as step:
            logger.info("Start query preprocessing step...")
            step.input = message

            # Create a query engine from the index
            query_engine = vector_index.as_query_engine()

            # Preprocess the query
            original_message = message
            message = query_utils.tweak_query(message, MY_CONFIG.LLM_MODEL)

            step.output = f"Optimized query: {message}"
        ## --- end: Step 1 ---

        # Query the index with structured prompting
        logger.info("Calling LLM with structured prompting...")
        t1 = time.time()

        # Get initial vector response
        vector_response = query_engine.query(message)
        vector_text = str(vector_response).strip()

        # Structured prompt
        structured_prompt = f"""Please provide a comprehensive, well-structured answer using the provided document information.

Question: {message}

Document Information:
{vector_text}

Instructions:
1. Provide accurate, factual information based on the documents
2. Structure your response clearly with proper formatting
3. Be comprehensive yet concise
4. Highlight key relationships and important details when relevant
5. Use bullet points or sections when appropriate for clarity

Please provide your answer:"""

        # Use structured prompt for final synthesis
        response = query_engine.query(structured_prompt)

        t2 = time.time()
        if response:
            response_text = str(response).strip()
        else:
            response_text = "No response from LLM."
        logger.info(f"LLM response received in {(t2 - t1):.2f} seconds:\n{response_text[:200]}")

        # Step 2: Vector search and retrieval
        async with cl.Step(name="Document Retrieval", type="retrieval") as step:
            step.input = message

            # Show retrieved documents
            if hasattr(response, 'source_nodes') and response.source_nodes:
                sources_output = []
                for i, node in enumerate(response.source_nodes[:3]):  # Show top 3 sources
                    score = node.score if hasattr(node, 'score') else 'N/A'
                    text_preview = node.text[:200] + "..." if len(node.text) > 200 else node.text
                    sources_output.append(f"Source {i+1} (Score: {score}): {text_preview}")
                step.output = "\n\n".join(sources_output)
            else:
                step.output = "No relevant documents found."


        # Extract thinking section if present
        thinking_content, cleaned_response = extract_thinking_section(response_text)
        # print (f"------ Thinking Content:-----\n{thinking_content}\n------") # Debug print
        # print (f"------ Cleaned Response:-----\n{cleaned_response}\n------") # Debug print

        # Step 3: Optional Thinking Process
        if thinking_content:
            async with cl.Step(name="💭 Thinking Process", type="run") as step:
                step.input = ""
                step.output = thinking_content
                logger.info(f"Thinking:\n{thinking_content[:200]}...")

        # Step 4: LLM Answer
        async with cl.Step(name="Response", type="llm") as step:
            step.input = f"Query: {message}\nContext: Retrieved from vector database"

            if cleaned_response:
                step.output = cleaned_response
                logger.info(f"Response:\n{cleaned_response[:200]}...")
            else:
                step.output = "No response from LLM."
                logger.info("Response:\nNo response from LLM.")

    except Exception as e:
        logger.error(f"Error getting LLM response: {str(e)}")
        response_text = f"Sorry, I encountered an error while processing your request:\n{str(e)}"

    end_time = time.time()
    elapsed_time = end_time - start_time

    return response_text, elapsed_time

## --- end: def get_llm_response():

# ====== CHAINLIT SPECIFIC CODE ======

@cl.set_starters
async def set_starters():
    starters = []
    for prompt in MY_CONFIG.STARTER_PROMPTS:
        starters.append(
            cl.Starter(
                label=prompt.strip(),
                message=prompt.strip(),
            )
        )
    return starters
## --- end: def set_starters(): ---

@cl.on_chat_start
async def start():
    """Initialize the chat session"""
    # Store initialization state in user session
    cl.user_session.set("chat_started", True)
    logger.info("User chat session started")
    init_error = None

    try:
        initialize()
        # await cl.Message(content="How can I assist you today?").send()
    except Exception as e:
        init_error = str(e)
        error_msg = f"""System Initialization Error

The system failed to initialize with the following error:

```
{init_error}
```

Please check your configuration and environment variables."""
        await cl.Message(content=error_msg).send()

@cl.on_message
async def main(message: cl.Message):
    """Handle incoming messages"""
    user_message = message.content

    # Get response from LLM with RAG steps shown FIRST
    response_text, elapsed_time = await get_llm_response(user_message)
    # logger.info(f"LLM Response:\n{response_text[:200]}...") # Log first 200 chars

    thinking_content, cleaned_response = extract_thinking_section(response_text)

    # Add timing stat to response
    full_response = cleaned_response + f"\n\n⏱️ *Total time: {elapsed_time:.1f} seconds*"

    # THEN create a new message for streaming
    msg = cl.Message(content="")
    await msg.send()

    # Stream the response character by character for better UX
    # This simulates streaming - in a real implementation you'd stream from the LLM
    for i in range(0, len(full_response), 5):  # Stream in chunks of 5 characters
        await msg.stream_token(full_response[i:i+5])
        await asyncio.sleep(0.01)  # Small delay for visual effect

    # Update the final message
    msg.content = full_response
    await msg.update()

## -------
if __name__ == '__main__':
    logger.info("App starting up...")
    print(f"{'='*60}\n")
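To try this interface locally, start it with Chainlit's CLI, e.g. `chainlit run app_chainlit.py --port 8082` (8082 being the `CHAINLIT_VECTOR_PORT` default in the compose files; any free port works).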
app_chainlit_graph.py
ADDED
@@ -0,0 +1,375 @@

"""
GraphRAG Chainlit Application
"""

import chainlit as cl
import os
import logging
from dotenv import load_dotenv
import time
import asyncio
import re
from typing import Dict, Any, Tuple

# Apply nest_asyncio to allow nested event loops
import nest_asyncio
nest_asyncio.apply()

# Import Step 1 functionality from setup module
from query_graph_functions.setup import create_graphrag_setup
# Import Steps 3-5 functionality from query preprocessing module
from query_graph_functions.query_preprocessing import create_query_preprocessor, preprocess_query_pipeline
# Import Steps 6-8 functionality from knowledge retrieval module
from query_graph_functions.knowledge_retrieval import CommunitySearchEngine
# Import Steps 9-12 functionality from follow-up search module
from query_graph_functions.follow_up_search import FollowUpSearch
# Import Steps 13-14 functionality from vector augmentation module
from query_graph_functions.vector_augmentation import VectorAugmentationEngine
# Import Steps 15-16 functionality from answer synthesis module
from query_graph_functions.answer_synthesis import AnswerSynthesisEngine
# Import Steps 17-20 functionality from response management module
from query_graph_functions.response_management import ResponseManager
from my_config import MY_CONFIG
import query_utils

# Configure environment
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# Load environment variables
load_dotenv()

# Create logs directory if it doesn't exist
os.makedirs('logs/chainlit', exist_ok=True)

# Configure logging - Save to file and console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s',
    handlers=[
        logging.FileHandler('logs/chainlit/chainlit_graph.log', mode='a'),
        logging.StreamHandler()
    ],
    force=True
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Log session start
logger.info("=" * 80)
logger.info(f"Chainlit GraphRAG Session Started - {time.strftime('%Y-%m-%d %H:%M:%S')}")
logger.info("=" * 80)

# Global GraphRAG engine instance
graph_engine = None
initialization_complete = False

def initialize():
    global graph_engine, initialization_complete

    if initialization_complete:
        return

    logger.info("Initializing GraphRAG system...")

    try:
        # Initialize setup module (Step 1)
        setup = create_graphrag_setup()

        # Create GraphRAG engine wrapper
        class GraphQueryEngine:
            def __init__(self, setup):
                self.setup = setup
                self.neo4j_conn = setup.neo4j_conn
                self.query_engine = setup.query_engine
                self.graph_stats = setup.graph_stats
                self.drift_config = setup.drift_config
                self.llm = setup.llm
                self.config = setup.config
                self.query_preprocessor = None
                self.response_manager = ResponseManager(setup)

            async def run_query_async(self, user_query):
                """Execute GraphRAG query pipeline (Steps 3-20)"""
                start_time = time.time()

                optimized_query = query_utils.tweak_query(user_query, MY_CONFIG.LLM_MODEL)

                try:
                    if not self.setup.validate_system_readiness():
                        return {"answer": "System not ready", "metadata": {}}

                    # Initialize query preprocessor if needed
                    if not self.query_preprocessor:
                        self.query_preprocessor = await create_query_preprocessor(
                            self.config, self.graph_stats
                        )

                    # Phase B: Query Preprocessing (Steps 3-5)
                    analysis, routing, vectorization = await preprocess_query_pipeline(
                        optimized_query, self.config, self.graph_stats
                    )

                    # Phase C: Community Retrieval (Steps 6-7)
                    community_engine = CommunitySearchEngine(self.setup)
                    community_results = await community_engine.execute_primer_phase(
                        vectorization.embedding, routing
                    )

                    # Phase D: Follow-up Search (Steps 9-12)
                    follow_up_engine = FollowUpSearch(self.setup)
                    follow_up_results = await follow_up_engine.execute_follow_up_phase(
                        community_results, routing
                    )

                    # Phase E: Vector Search Augmentation (Steps 13-14)
                    vector_engine = VectorAugmentationEngine(self.setup)
                    augmentation_results = await vector_engine.execute_vector_augmentation_phase(
                        vectorization.embedding,
                        {'communities': community_results['communities'],
                         'initial_answer': community_results['initial_answer'],
                         'follow_up_results': follow_up_results},
                        routing
                    )

                    # Phase F: Answer Synthesis (Steps 15-16)
                    synthesis_engine = AnswerSynthesisEngine(self.setup)
                    synthesis_results = await synthesis_engine.execute_answer_synthesis_phase(
                        analysis, routing, community_results, follow_up_results, augmentation_results
                    )

                    total_time = time.time() - start_time

                    # Generate metadata
                    metadata = self.response_manager.generate_comprehensive_metadata(
                        analysis=analysis,
                        routing=routing,
                        vectorization=vectorization,
                        community_results=community_results,
                        follow_up_results=follow_up_results,
                        augmentation_results=augmentation_results,
                        synthesis_results=synthesis_results,
                        total_time=total_time
                    )

                    # Cleanup async tasks
                    await self.setup.cleanup_async_tasks(timeout=2.0)

                    return {
                        "answer": synthesis_results.final_answer,
                        "metadata": metadata,
                        "analysis": analysis,
                        "routing": routing,
                        "community_results": community_results,
                        "follow_up_results": follow_up_results,
                        "augmentation_results": augmentation_results
                    }

                except Exception as e:
                    logger.error(f"Query pipeline error: {e}")
                    synthesis_engine = AnswerSynthesisEngine(self.setup)
                    return synthesis_engine.generate_error_response(f"Query error: {e}")

        graph_engine = GraphQueryEngine(setup)

        logger.info("✅ GraphRAG system initialized")
        logger.info(f"✅ Using LLM: {MY_CONFIG.LLM_MODEL}")
        logger.info(f"✅ Using embedding: {MY_CONFIG.EMBEDDING_MODEL}")

        initialization_complete = True

    except Exception as e:
        initialization_complete = False
        logger.error(f"GraphRAG initialization error: {str(e)}")
        raise

def extract_thinking_section(response_text):
    """
    Extract thinking section from LLM response if present.

    Args:
        response_text (str): The full response from the LLM

    Returns:
        tuple: (thinking_content, cleaned_response)
            - thinking_content: Content within <think></think> tags or None if not found
            - cleaned_response: Response with thinking section removed
    """
    thinking_pattern = r'<think>(.*?)</think>'
    match = re.search(thinking_pattern, response_text, re.DOTALL)

    if match:
        thinking_content = match.group(1).strip()
        cleaned_response = re.sub(thinking_pattern, '', response_text, flags=re.DOTALL).strip()
        return thinking_content, cleaned_response
    else:
        return None, response_text

async def get_llm_response(message):
    """
    Process the user message through the GraphRAG pipeline,
    surfacing each phase as a Chainlit step.
    """
    global graph_engine, initialization_complete

    if not initialization_complete or graph_engine is None:
        return "System not initialized. Please try again later.", 0

    start_time = time.time()

    try:
        # Step 1: Query Preprocessing
        async with cl.Step(name="Query Analysis", type="tool") as step:
            step.input = message
            optimized_query = query_utils.tweak_query(message, MY_CONFIG.LLM_MODEL)
            step.output = f"Optimized query: {optimized_query}"

        # Execute GraphRAG query pipeline
        result = await graph_engine.run_query_async(message)

        # Step 2: Community Search
        if 'community_results' in result:
            async with cl.Step(name="Community Retrieval", type="retrieval") as step:
                communities = result['community_results'].get('communities', [])
                step.input = "Searching graph communities"
                step.output = f"Found {len(communities)} relevant communities"

        # Step 3: Follow-up Search
        if 'follow_up_results' in result:
            async with cl.Step(name="Entity Search", type="retrieval") as step:
                step.input = "Analyzing entity relationships"
                follow_up = result['follow_up_results']

                entities_found = len(follow_up.get('detailed_entities', []))
                relationships_found = sum(
                    len(search.traversed_relationships)
                    for search in follow_up.get('local_search_results', [])
                )
                step.output = f"Entities: {entities_found}, Relationships: {relationships_found}"

        # Step 4: Vector Augmentation
        if 'augmentation_results' in result:
            async with cl.Step(name="Document Augmentation", type="retrieval") as step:
                step.input = "Enriching with vector search"
                aug_results = result['augmentation_results']

                if hasattr(aug_results, 'vector_results'):
                    chunks = aug_results.vector_results
                    step.output = f"Retrieved {len(chunks)} relevant document chunks"
                else:
                    step.output = "Vector augmentation completed"

        # Extract answer and timing
        full_response = result.get('answer', 'No response generated')

        # Filter out metadata section more robustly
        lines = full_response.split('\n')
        filtered_lines = []

        for line in lines:
            stripped = line.strip()
            # Skip these metadata lines completely
            if (stripped.startswith('## Comprehensive Answer') or
                stripped.startswith('# Comprehensive Answer') or
                stripped.startswith('---') or
                stripped.startswith('**Answer Confidence**:') or
                stripped.startswith('**Sources Integrated**:') or
                stripped.startswith('**Multi-Phase Coverage**:')):
                continue

            filtered_lines.append(line)

        response_text = '\n'.join(filtered_lines).strip()

        # Extract thinking section if present
        thinking_content, cleaned_response = extract_thinking_section(response_text)

        # Step 5: Optional Thinking Process
        if thinking_content:
            async with cl.Step(name="Reasoning Process", type="run") as step:
                step.input = ""
                step.output = thinking_content
                logger.info(f"Thinking:\n{thinking_content[:200]}...")

        # Step 6: Final Answer
        async with cl.Step(name="Synthesis", type="llm") as step:
            step.input = "Generating comprehensive answer"
            step.output = cleaned_response if cleaned_response else response_text

        end_time = time.time()
        elapsed_time = end_time - start_time

        return cleaned_response if cleaned_response else response_text, elapsed_time

    except Exception as e:
        logger.error(f"Error processing query: {str(e)}")
        return f"Sorry, I encountered an error:\n{str(e)}", 0

# ====== CHAINLIT SPECIFIC CODE ======

@cl.set_starters
async def set_starters():
    starters = []
    for prompt in MY_CONFIG.STARTER_PROMPTS:
        starters.append(
            cl.Starter(
                label=prompt.strip(),
                message=prompt.strip(),
            )
        )
    return starters
## --- end: def set_starters(): ---

@cl.on_chat_start
async def start():
    """Initialize the chat session"""
    # Store initialization state in user session
    cl.user_session.set("chat_started", True)
    logger.info("User chat session started")
    init_error = None

    try:
        initialize()
        # await cl.Message(content="How can I assist you today?").send()
    except Exception as e:
        init_error = str(e)
        error_msg = f"""System Initialization Error

The system failed to initialize with the following error:

```
{init_error}
```

Please check your configuration and environment variables."""
        await cl.Message(content=error_msg).send()

@cl.on_message
async def main(message: cl.Message):
    """Handle incoming messages"""
    user_message = message.content

    # Get response from LLM with RAG steps shown FIRST
    response_text, elapsed_time = await get_llm_response(user_message)
    # logger.info(f"LLM Response:\n{response_text[:200]}...") # Log first 200 chars

    thinking_content, cleaned_response = extract_thinking_section(response_text)

    # Add timing stat to response
    full_response = cleaned_response + f"\n\n⏱️ *Total time: {elapsed_time:.1f} seconds*"

    # THEN create a new message for streaming
    msg = cl.Message(content="")
    await msg.send()

    # Stream the response character by character for better UX
    # This simulates streaming - in a real implementation you'd stream from the LLM
    for i in range(0, len(full_response), 5):  # Stream in chunks of 5 characters
        await msg.stream_token(full_response[i:i+5])
        await asyncio.sleep(0.01)  # Small delay for visual effect

    # Update the final message
    msg.content = full_response
    await msg.update()

## -------
if __name__ == '__main__':
    logger.info("App starting up...")
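A side note on the `nest_asyncio.apply()` call both GraphRAG apps make at import time: it patches asyncio so a blocking `run_until_complete()` can be issued from inside an already-running event loop, which the synchronous `run_query()` wrapper in `app_flask_graph.py` further below depends on. A minimal, self-contained illustration (a sketch, not part of the repo):

```python
import asyncio
import nest_asyncio

nest_asyncio.apply()  # patch asyncio to tolerate re-entrant event loops

async def inner():
    return 42

async def outer():
    # A nested, blocking wait inside an already-running loop;
    # without nest_asyncio this raises "This event loop is already running".
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(inner())

print(asyncio.run(outer()))  # 42
```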
app_flask.py
ADDED
@@ -0,0 +1,189 @@

from flask import Flask, g, render_template, request, jsonify
import os
import logging
import time

# Import llama-index and related libraries
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.llms.litellm import LiteLLM
from my_config import MY_CONFIG
import query_utils


os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT


app = Flask(__name__)

# Global variables for LLM and index
vector_index = None
initialization_complete = False

def initialize():
    """
    Initialize LLM and Milvus vector database using llama-index.
    This function sets up the necessary components for the chat application.
    """
    global vector_index, initialization_complete

    if initialization_complete:
        return

    logging.info("Initializing LLM and vector database...")

    # raise Exception ("init exception test") # debug

    try:
        ## embedding model
        Settings.embed_model = HuggingFaceEmbedding(
            model_name=MY_CONFIG.EMBEDDING_MODEL
        )
        print("✅ Using embedding model: ", MY_CONFIG.EMBEDDING_MODEL)

        # Setup LLM using LiteLLM
        llm = LiteLLM(
            model=MY_CONFIG.LLM_MODEL,
            temperature=0.1
        )
        print("✅ LLM run environment: ", MY_CONFIG.LLM_RUN_ENV)
        print("✅ Using LLM model: ", MY_CONFIG.LLM_MODEL)
        Settings.llm = llm

        # Initialize Milvus vector store for Vector RAG only
        vector_store = MilvusVectorStore(
            uri=MY_CONFIG.MILVUS_URI_VECTOR,  # Use dedicated Vector-only database
            dim=MY_CONFIG.EMBEDDING_LENGTH,
            collection_name=MY_CONFIG.COLLECTION_NAME,
            overwrite=False  # so we load the index from db
        )
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        print("✅ Connected to Vector-only Milvus instance: ", MY_CONFIG.MILVUS_URI_VECTOR)

        vector_index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store, storage_context=storage_context)
        print("✅ Loaded Vector-only index from:", MY_CONFIG.MILVUS_URI_VECTOR)

        logging.info("Successfully initialized LLM and vector database")

        initialization_complete = True
    except Exception as e:
        initialization_complete = False
        logging.error(f"Error initializing LLM and vector database: {str(e)}")
        raise
## -------------

## ----
@app.route('/')
def index():
    init_error = app.config.get('INIT_ERROR', '')
    # init_error = g.get('init_error', None)
    return render_template('index.html', init_error=init_error)
## end --- def index():


## -----
@app.route('/chat', methods=['POST'])
def chat():
    user_message = request.json.get('message')

    # Get response from LLM
    response = get_llm_response(user_message)
    # print (response)

    return jsonify({'response': response})
## end : def chat():


def get_llm_response(message):
    """
    Process the user message and get a response from the LLM using Vector RAG
    with structured prompting.
    """
    global vector_index, initialization_complete

    # Check if LLM and index are initialized
    if vector_index is None or not initialization_complete:
        return "System did not initialize. Please try again later."

    start_time = time.time()
    response_text = ''

    try:
        # raise Exception ("chat exception test") ## debug
        # Create a query engine from the index
        query_engine = vector_index.as_query_engine()

        # Apply query optimization
        message = query_utils.tweak_query(message, MY_CONFIG.LLM_MODEL)

        # Get initial vector response
        vector_response = query_engine.query(message)
        vector_text = str(vector_response).strip()

        # Structured prompt
        structured_prompt = f"""Please provide a comprehensive, well-structured answer using the provided document information.

Question: {message}

Document Information:
{vector_text}

Instructions:
1. Provide accurate, factual information based on the documents
2. Structure your response clearly with proper formatting
3. Be comprehensive yet concise
4. Highlight key relationships and important details when relevant
5. Use bullet points or sections when appropriate for clarity

Please provide your answer:"""

        # Use structured prompt for final synthesis
        final_response = query_engine.query(structured_prompt)

        if final_response:
            response_text = str(final_response).strip()

    except Exception as e:
        logging.error(f"Error getting LLM response: {str(e)}")
        response_text = f"Sorry, I encountered an error while processing your request:\n{str(e)}"

    end_time = time.time()

    # add timing stat
    response_text += f"\n⏱️ *Total time: {(end_time - start_time):.1f} seconds*"
    return response_text

## --- end: def get_llm_response():


## -------
if __name__ == '__main__':
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    logging.info("App starting up...")

    # Initialize LLM and vector database
    try:
        initialize()
    except Exception as e:
        logging.warning("Starting without LLM and vector database. Responses will be limited.")
        app.config['INIT_ERROR'] = str(e)
        # g.init_error = str(e)

    # Vector RAG Flask App - Configurable port via environment
    PORT = MY_CONFIG.FLASK_VECTOR_PORT
    print(f"🚀 Vector RAG Flask app starting on port {PORT}")
    app.run(host="0.0.0.0", debug=False, port=PORT)
## -- end main ----
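Once `app_flask.py` is running, its `/chat` endpoint can be smoke-tested from Python. A sketch (the port assumes the `FLASK_VECTOR_PORT` default of 8081 from the compose files; adjust if yours differs):

```python
import requests

# POST a question to the Vector RAG chat endpoint and print the answer
resp = requests.post(
    "http://localhost:8081/chat",
    json={"message": "What is AllyCat?"},
)
resp.raise_for_status()
print(resp.json()["response"])
```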
app_flask_graph.py
ADDED
@@ -0,0 +1,264 @@

"""
GraphRAG Flask Web Application
"""

from flask import Flask, render_template, request, jsonify
import os
import logging
import time
import asyncio
import nest_asyncio
from query_graph_functions.setup import create_graphrag_setup
from query_graph_functions.query_preprocessing import create_query_preprocessor, preprocess_query_pipeline
from query_graph_functions.knowledge_retrieval import CommunitySearchEngine
from query_graph_functions.follow_up_search import FollowUpSearch
from query_graph_functions.vector_augmentation import VectorAugmentationEngine
from query_graph_functions.answer_synthesis import AnswerSynthesisEngine
from query_graph_functions.response_management import ResponseManager
from my_config import MY_CONFIG
import query_utils


nest_asyncio.apply()
os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT

app = Flask(__name__)

# Global GraphRAG engine
graph_engine = None
initialization_complete = False

def initialize():
    """
    Initialize the GraphRAG system.
    """
    global graph_engine, initialization_complete

    if initialization_complete:
        return

    logging.info("Initializing GraphRAG system...")

    try:
        # Initialize setup module (Step 1)
        setup = create_graphrag_setup()

        # Create GraphRAG engine wrapper
        class GraphQueryEngine:
            def __init__(self, setup):
                self.setup = setup
                self.neo4j_conn = setup.neo4j_conn
                self.query_engine = setup.query_engine
                self.graph_stats = setup.graph_stats
                self.drift_config = setup.drift_config
                self.llm = setup.llm
                self.config = setup.config
                self.query_preprocessor = None
                self.response_manager = ResponseManager(setup)

            async def run_query_async(self, user_query):
                """Execute GraphRAG query pipeline (Steps 3-20)"""
                start_time = time.time()

                optimized_query = query_utils.tweak_query(user_query, MY_CONFIG.LLM_MODEL)

                try:
                    if not self.setup.validate_system_readiness():
                        return {"answer": "System not ready", "metadata": {}}

                    # Initialize query preprocessor if needed
                    if not self.query_preprocessor:
                        self.query_preprocessor = await create_query_preprocessor(
                            self.config, self.graph_stats
                        )

                    # Phase B: Query Preprocessing (Steps 3-5)
                    analysis, routing, vectorization = await preprocess_query_pipeline(
                        optimized_query, self.config, self.graph_stats
                    )

                    # Phase C: Community Retrieval (Steps 6-7)
                    community_engine = CommunitySearchEngine(self.setup)
                    community_results = await community_engine.execute_primer_phase(
                        vectorization.embedding, routing
                    )

                    # Phase D: Follow-up Search (Steps 9-12)
                    follow_up_engine = FollowUpSearch(self.setup)
                    follow_up_results = await follow_up_engine.execute_follow_up_phase(
                        community_results, routing
                    )

                    # Phase E: Vector Search Augmentation (Steps 13-14)
                    vector_engine = VectorAugmentationEngine(self.setup)
                    augmentation_results = await vector_engine.execute_vector_augmentation_phase(
                        vectorization.embedding,
                        {'communities': community_results['communities'],
                         'initial_answer': community_results['initial_answer'],
                         'follow_up_results': follow_up_results},
                        routing
                    )

                    # Phase F: Answer Synthesis (Steps 15-16)
                    synthesis_engine = AnswerSynthesisEngine(self.setup)
                    synthesis_results = await synthesis_engine.execute_answer_synthesis_phase(
                        analysis, routing, community_results, follow_up_results, augmentation_results
                    )

                    total_time = time.time() - start_time

                    # Generate metadata
                    metadata = self.response_manager.generate_comprehensive_metadata(
                        analysis=analysis,
                        routing=routing,
                        vectorization=vectorization,
                        community_results=community_results,
                        follow_up_results=follow_up_results,
                        augmentation_results=augmentation_results,
                        synthesis_results=synthesis_results,
                        total_time=total_time
                    )

                    # Cleanup async tasks
                    await self.setup.cleanup_async_tasks(timeout=2.0)

                    return {
                        "answer": synthesis_results.final_answer,
                        "metadata": metadata
                    }

                except Exception as e:
                    logging.error(f"Query pipeline error: {e}")
                    synthesis_engine = AnswerSynthesisEngine(self.setup)
                    return synthesis_engine.generate_error_response(f"Query error: {e}")

            def run_query(self, user_query):
                """Synchronous wrapper for the async query"""
                try:
                    loop = asyncio.get_event_loop()
                    return loop.run_until_complete(self.run_query_async(user_query))
                except Exception as e:
                    logging.error(f"Query execution error: {e}")
                    return {"answer": f"Error: {e}", "metadata": {}}

        graph_engine = GraphQueryEngine(setup)

        print("✅ GraphRAG system initialized")
        print(f"✅ Using LLM: {MY_CONFIG.LLM_MODEL}")
        print(f"✅ Using embedding: {MY_CONFIG.EMBEDDING_MODEL}")

        logging.info("GraphRAG system ready")
        initialization_complete = True

    except Exception as e:
        initialization_complete = False
        logging.error(f"GraphRAG initialization error: {str(e)}")
        raise

## ----
@app.route('/')
def index():
    init_error = app.config.get('INIT_ERROR', '')
    # init_error = g.get('init_error', None)
    return render_template('index.html', init_error=init_error)
## end --- def index():


## ----
@app.route('/health')
def health():
    """Health check endpoint for deployment platforms"""
    if initialization_complete:
        return jsonify({"status": "healthy", "graphrag": "initialized"}), 200
    else:
        return jsonify({"status": "initializing"}), 503
## end --- def health():


## -----
@app.route('/chat', methods=['POST'])
def chat():
    user_message = request.json.get('message')

    # Get response from LLM
    response = get_llm_response(user_message)
    # print (response)

    return jsonify({'response': response})
## end : def chat():


def get_llm_response(message):
    """
    Process the user message using the complete GraphRAG pipeline.
    Implements the full 25-step DRIFT search methodology.
    """
    global graph_engine, initialization_complete

    if not initialization_complete or graph_engine is None:
        return "System not initialized. Please try again later."

    start_time = time.time()

    try:
        # Execute GraphRAG query pipeline
        result = graph_engine.run_query(message)

        # Extract answer and timing
        full_response = result.get('answer', 'No response generated')

        # Filter out metadata section more robustly
        lines = full_response.split('\n')
        filtered_lines = []

        for line in lines:
            stripped = line.strip()
            # Skip these metadata lines completely
            if (stripped.startswith('## Comprehensive Answer') or
                stripped.startswith('# Comprehensive Answer') or
                stripped.startswith('---') or
                stripped.startswith('**Answer Confidence**:') or
                stripped.startswith('**Sources Integrated**:') or
                stripped.startswith('**Multi-Phase Coverage**:')):
                continue

            filtered_lines.append(line)

        response_text = '\n'.join(filtered_lines).strip()
        end_time = time.time()

        # Add timing information
        response_text += f"\n\n⏱️ *Total time: {(end_time - start_time):.1f} seconds*"

        return response_text

    except Exception as e:
        logging.error(f"Error processing query: {str(e)}")
        return f"Sorry, I encountered an error:\n{str(e)}"


## -------
if __name__ == '__main__':
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    logging.info("App starting up...")

    # Initialize the GraphRAG engine
    try:
        initialize()
    except Exception as e:
        logging.warning("Starting without the GraphRAG engine. Responses will be limited.")
        app.config['INIT_ERROR'] = str(e)
        # g.init_error = str(e)

    # GraphRAG Flask App - Configurable port via environment
    PORT = MY_CONFIG.FLASK_GRAPH_PORT
    print(f"🚀 GraphRAG Flask app starting on port {PORT}")
    app.run(host="0.0.0.0", debug=False, port=PORT)
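The `/health` route above is what deployment platforms should probe for readiness. A quick check (a sketch, assuming the default port 8080):

```python
import requests

r = requests.get("http://localhost:8080/health")
# 200 + {"status": "healthy", ...} once initialized; 503 + {"status": "initializing"} before that
print(r.status_code, r.json())
```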
chainlit.md
ADDED
@@ -0,0 +1,14 @@

# Welcome to Chainlit! 🚀🤖

Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.

## Useful Links 🔗

- **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
- **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬

We can't wait to see what you create with Chainlit! Happy coding! 💻😊

## Welcome screen

To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
cleanup_pipeline_deps.sh
ADDED
|
@@ -0,0 +1,55 @@
```bash
#!/bin/bash
# ============================================
# Cleanup Pipeline Dependencies Script
# ============================================
# This script removes heavy packages that are only needed for the pipeline
# to save ~350-500 MB of RAM in production Docker containers.
#
# Run this AFTER the pipeline completes successfully.
# ============================================

echo "============================================"
echo "Starting Pipeline Dependency Cleanup"
echo "============================================"
echo "This will remove packages only needed for:"
echo " - Document processing (docling, html2text)"
echo " - Graph community detection (igraph, leidenalg, etc.)"
echo " - Development tools (ipykernel, tqdm, etc.)"
echo ""
echo "Estimated RAM savings: 350-500 MB"
echo "============================================"

# Document processing packages
echo "Removing document processing packages..."
pip uninstall -y docling html2text 2>/dev/null || echo " (already removed or not installed)"

# Graph community detection packages
echo "Removing graph community detection packages..."
pip uninstall -y python-louvain igraph leidenalg graspologic 2>/dev/null || echo " (already removed or not installed)"

# Development tools
echo "Removing development tools..."
pip uninstall -y tqdm ipykernel fastmcp 2>/dev/null || echo " (already removed or not installed)"

# Milvus Lite (if using cloud Zilliz)
if [ "$VECTOR_DB_TYPE" = "cloud_zilliz" ]; then
  echo "Removing Milvus Lite (using cloud Zilliz)..."
  pip uninstall -y milvus-lite 2>/dev/null || echo " (already removed or not installed)"
fi

# Chainlit (if using Flask only)
if [ "$APP_TYPE" = "flask_graph" ] || [ "$APP_TYPE" = "flask" ]; then
  echo "Removing Chainlit (using Flask app)..."
  pip uninstall -y chainlit 2>/dev/null || echo " (already removed or not installed)"
fi

echo ""
echo "============================================"
echo "Cleanup Complete!"
echo "============================================"
echo "Before cleanup: ~800 MB"
echo "After cleanup:  ~300-450 MB (depending on config)"
echo ""
echo "Note: If you redeploy and AUTO_RUN_PIPELINE=true,"
echo "      all packages will be reinstalled automatically."
echo "============================================"
```
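After running the cleanup, it can be worth verifying that the heavy packages are really gone before trusting the RAM estimate. A small sketch; the module names are assumptions based on the pip package names uninstalled above (each of these packages imports under the same name):

```python
# Check which pipeline-only packages are still importable (sketch).
import importlib.util

for mod in ["docling", "html2text", "igraph", "leidenalg", "tqdm"]:
    status = "still installed" if importlib.util.find_spec(mod) else "removed"
    print(f"{mod}: {status}")
```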
docker-compose.cloud.yml
ADDED
@@ -0,0 +1,34 @@
```yaml
version: '3.8'

services:
  allycat-graphrag-cloud:
    image: allycat-graphrag:cloud
    build:
      context: .
      dockerfile: Dockerfile
      args:
        INSTALL_OLLAMA: "false"
        INSTALL_LOCAL_VECTOR_DB: "false"
    container_name: allycat-cloud
    ports:
      - "${DOCKER_PORT:-8080}:${DOCKER_APP_PORT:-8080}"
    environment:
      - LLM_RUN_ENV=cloud
      - VECTOR_DB_TYPE=cloud_zilliz
      - APP_TYPE=${APP_TYPE:-flask_graph}
      - AUTO_RUN_PIPELINE=${AUTO_RUN_PIPELINE:-false}
      - DOCKER_APP_PORT=${DOCKER_APP_PORT:-8080}
      - FLASK_GRAPH_PORT=${FLASK_GRAPH_PORT:-8080}
      - FLASK_VECTOR_PORT=${FLASK_VECTOR_PORT:-8081}
      - CHAINLIT_GRAPH_PORT=${CHAINLIT_GRAPH_PORT:-8083}
      - CHAINLIT_VECTOR_PORT=${CHAINLIT_VECTOR_PORT:-8082}
    env_file:
      - .env
    command: ["deploy"]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${DOCKER_APP_PORT:-8080}"]
      interval: 60s
      timeout: 60s
      retries: 1
      start_period: 1500s
```
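The `${VAR:-default}` references above use Compose's variable substitution: the value comes from the shell or `.env` when set and non-empty, otherwise the default after `:-` applies. A sketch of the same rule in Python, for computing the effective port mapping:

```python
# Reproduce Compose's "${VAR:-default}" rule for the port mapping (sketch).
import os

host_port = os.getenv("DOCKER_PORT") or "8080"           # left side: host
container_port = os.getenv("DOCKER_APP_PORT") or "8080"  # right side: container
print(f"published mapping: {host_port}:{container_port}")
```

Using `or` rather than a `getenv` default matters here: like Compose's `:-`, it also falls back when the variable is set but empty.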
docker-compose.hybrid.yml
ADDED
@@ -0,0 +1,36 @@
```yaml
version: '3.8'

services:
  allycat-graphrag-hybrid:
    image: allycat-graphrag:hybrid
    build:
      context: .
      dockerfile: Dockerfile
      args:
        INSTALL_OLLAMA: "false"
        INSTALL_LOCAL_VECTOR_DB: "true"
    container_name: allycat-hybrid
    ports:
      - "${DOCKER_PORT:-8080}:${DOCKER_APP_PORT:-8080}"
    environment:
      - LLM_RUN_ENV=cloud
      - VECTOR_DB_TYPE=local
      - APP_TYPE=${APP_TYPE:-flask_graph}
      - AUTO_RUN_PIPELINE=${AUTO_RUN_PIPELINE:-false}
      - DOCKER_APP_PORT=${DOCKER_APP_PORT:-8080}
      - FLASK_GRAPH_PORT=${FLASK_GRAPH_PORT:-8080}
      - FLASK_VECTOR_PORT=${FLASK_VECTOR_PORT:-8081}
      - CHAINLIT_GRAPH_PORT=${CHAINLIT_GRAPH_PORT:-8083}
      - CHAINLIT_VECTOR_PORT=${CHAINLIT_VECTOR_PORT:-8082}
    env_file:
      - .env
    volumes:
      - ./workspace:/allycat/workspace
    command: ["deploy"]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${DOCKER_APP_PORT:-8080}"]
      interval: 60s
      timeout: 60s
      retries: 1
      start_period: 1500s
```
docker-compose.local.yml
ADDED
@@ -0,0 +1,38 @@
```yaml
version: '3.8'

services:
  allycat-graphrag-local:
    image: allycat-graphrag:local
    build:
      context: .
      dockerfile: Dockerfile
      args:
        INSTALL_OLLAMA: "true"
        INSTALL_LOCAL_VECTOR_DB: "true"
    container_name: allycat-local
    ports:
      - "${DOCKER_PORT:-8080}:${DOCKER_APP_PORT:-8080}"
      - "${OLLAMA_PORT:-11434}:11434"
    environment:
      - LLM_RUN_ENV=local_ollama
      - VECTOR_DB_TYPE=local
      - APP_TYPE=${APP_TYPE:-flask_graph}
      - AUTO_RUN_PIPELINE=${AUTO_RUN_PIPELINE:-false}
      - DOCKER_APP_PORT=${DOCKER_APP_PORT:-8080}
      - FLASK_GRAPH_PORT=${FLASK_GRAPH_PORT:-8080}
      - FLASK_VECTOR_PORT=${FLASK_VECTOR_PORT:-8081}
      - CHAINLIT_GRAPH_PORT=${CHAINLIT_GRAPH_PORT:-8083}
      - CHAINLIT_VECTOR_PORT=${CHAINLIT_VECTOR_PORT:-8082}
      - OLLAMA_PORT=${OLLAMA_PORT:-11434}
    env_file:
      - .env
    volumes:
      - ./workspace:/allycat/workspace
    command: ["deploy"]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${DOCKER_APP_PORT:-8080}"]
      interval: 60s
      timeout: 60s
      retries: 1
      start_period: 1500s
```
docker-startup.sh
ADDED
@@ -0,0 +1,206 @@
```bash
#!/bin/bash

echo "=== AllyCAT GraphRAG Docker Startup ==="

# Check deployment mode from environment
LLM_MODE=${LLM_RUN_ENV:-cloud}
VECTOR_MODE=${VECTOR_DB_TYPE:-cloud_zilliz}

echo "LLM Mode: $LLM_MODE"
echo "Vector DB Mode: $VECTOR_MODE"

# Conditional: Start Ollama only if in local mode
if [ "$LLM_MODE" = "local_ollama" ]; then
  echo "Starting Ollama in local mode..."

  # Define OLLAMA_MODELS dir
  if [ -z "$OLLAMA_MODELS" ]; then
    export OLLAMA_MODELS=/allycat/workspace/ollama
  fi

  echo "Env variables for OLLAMA:"
  env | grep OLLAMA

  # Start ollama
  ollama_model=${OLLAMA_MODEL:-gemma3:1b}
  echo "Starting Ollama server..."
  ollama serve > /allycat/ollama-serve.out 2>&1 &

  # Wait for ollama to start
  OLLAMA_PORT=${OLLAMA_PORT:-11434}
  while ! nc -z localhost $OLLAMA_PORT; do
    sleep 1
  done
  echo "✅ Ollama started on port $OLLAMA_PORT"

  # Only download the model if we are in DEPLOY mode
  if [ "$1" == "deploy" ]; then
    echo "Downloading Ollama model: $ollama_model"
    ollama pull $ollama_model
    echo "✅ Ollama model downloaded: $ollama_model"
  fi
else
  echo "✅ Using cloud LLM mode - Ollama not started"
fi

# Conditional: Setup local vector DB only if needed
if [ "$VECTOR_MODE" = "local" ]; then
  echo "Setting up local Milvus vector database..."
  mkdir -p /allycat/workspace
  echo "✅ Local vector database directory created"
else
  echo "✅ Using Zilliz Cloud for vector database"
fi

# Run GraphRAG pipeline if AUTO_RUN_PIPELINE is enabled and in deploy mode
if [ "$1" == "deploy" ] && [ "${AUTO_RUN_PIPELINE:-false}" = "true" ]; then
  echo ""
  echo "=== Running GraphRAG Pipeline Automatically ==="
  echo ""

  # Step 1: Crawl website
  if [ -n "$WEBSITE_URL" ]; then
    echo "Step 1/5: Crawling website: $WEBSITE_URL"
    python3 1_crawl_site.py || echo "⚠️ Warning: Crawl failed, continuing..."
    echo "✅ Step 1 complete"
    echo ""
  else
    echo "⚠️ Skipping crawl - WEBSITE_URL not set"
  fi

  # Step 2: Process files to markdown
  echo "Step 2/5: Processing files to markdown..."
  python3 2_process_files.py || echo "⚠️ Warning: Processing failed, continuing..."
  echo "✅ Step 2 complete"
  echo ""

  # Step 3: Save to vector database
  echo "Step 3/5: Saving to vector database..."
  if [ "$VECTOR_MODE" = "cloud_zilliz" ]; then
    python3 3_save_to_vector_db_zilliz.py || echo "⚠️ Warning: Vector DB save failed, continuing..."
  else
    python3 3_save_to_vector_db.py || echo "⚠️ Warning: Vector DB save failed, continuing..."
  fi
  echo "✅ Step 3 complete"
  echo ""

  # Step 4: Process graph data (3 phases)
  echo "Step 4/5: Processing graph data (3 phases)..."
  echo "  Phase 1: Extracting entities and relationships..."
  python3 2b_process_graph_phase1.py || echo "⚠️ Warning: Phase 1 failed, continuing..."
  echo "  Phase 2: Building communities..."
  python3 2b_process_graph_phase2.py || echo "⚠️ Warning: Phase 2 failed, continuing..."
  echo "  Phase 3: Generating community summaries..."
  python3 2b_process_graph_phase3.py || echo "⚠️ Warning: Phase 3 failed, continuing..."
  echo "✅ Step 4 complete"
  echo ""

  # Step 5: Save to graph database
  echo "Step 5/5: Saving to graph database..."
  python3 3b_save_to_graph_db.py || echo "⚠️ Warning: Graph DB save failed, continuing..."
  echo "✅ Step 5 complete"
  echo ""

  echo "=== ✅ Pipeline Complete - Starting Application ==="
  echo ""

  # OPTIMIZATION: Clean up pipeline dependencies to save RAM
  if [ "${CLEANUP_PIPELINE_DEPS:-false}" = "true" ]; then
    echo ""
    echo "=== 🧹 Cleaning Up Pipeline Dependencies ==="
    echo "This will save ~350-500 MB of RAM"
    echo ""
    chmod +x ./cleanup_pipeline_deps.sh
    ./cleanup_pipeline_deps.sh
    echo ""
    echo "=== ✅ Cleanup Complete ==="
    echo ""
  else
    echo ""
    echo "💡 TIP: Set CLEANUP_PIPELINE_DEPS=true in .env to save ~350-500 MB RAM"
    echo "   after pipeline completes (reduces OOM errors on 1GB containers)"
    echo ""
  fi
fi

# Start the appropriate web application
APP_TYPE=${APP_TYPE:-flask_graph}
DOCKER_APP_PORT=${DOCKER_APP_PORT:-8080}
FLASK_GRAPH_PORT=${FLASK_GRAPH_PORT:-8080}
FLASK_VECTOR_PORT=${FLASK_VECTOR_PORT:-8081}
CHAINLIT_GRAPH_PORT=${CHAINLIT_GRAPH_PORT:-8083}
CHAINLIT_VECTOR_PORT=${CHAINLIT_VECTOR_PORT:-8082}

# Log port configuration
echo ""
echo "=== Port Configuration ==="
echo "DOCKER_APP_PORT (internal container): $DOCKER_APP_PORT"
echo "FLASK_GRAPH_PORT: $FLASK_GRAPH_PORT"
echo "FLASK_VECTOR_PORT: $FLASK_VECTOR_PORT"
echo "CHAINLIT_GRAPH_PORT: $CHAINLIT_GRAPH_PORT"
echo "CHAINLIT_VECTOR_PORT: $CHAINLIT_VECTOR_PORT"
echo ""

# Determine which port will be used based on APP_TYPE
case $APP_TYPE in
  "flask_graph")
    APP_PORT=$FLASK_GRAPH_PORT
    ;;
  "chainlit_graph")
    APP_PORT=$CHAINLIT_GRAPH_PORT
    ;;
  "flask")
    APP_PORT=$FLASK_VECTOR_PORT
    ;;
  "chainlit")
    APP_PORT=$CHAINLIT_VECTOR_PORT
    ;;
  *)
    APP_PORT=$FLASK_GRAPH_PORT
    ;;
esac

echo "Selected APP_TYPE: $APP_TYPE will run on port: $APP_PORT"
echo "Container will expose application on port: $DOCKER_APP_PORT (mapped to host DOCKER_PORT)"
echo ""

if [ "$1" == "deploy" ]; then
  echo "In deploy mode..."

  case $APP_TYPE in
    "flask_graph")
      echo "Starting Flask GraphRAG app on port $FLASK_GRAPH_PORT..."
      python3 app_flask_graph.py
      ;;
    "chainlit_graph")
      echo "Starting Chainlit GraphRAG app on port $CHAINLIT_GRAPH_PORT..."
      chainlit run app_chainlit_graph.py --host 0.0.0.0 --port $CHAINLIT_GRAPH_PORT
      ;;
    "flask")
      echo "Starting Flask Vector RAG app on port $FLASK_VECTOR_PORT..."
      python3 app_flask.py
      ;;
    "chainlit")
      echo "Starting Chainlit Vector RAG app on port $CHAINLIT_VECTOR_PORT..."
      chainlit run app_chainlit.py --host 0.0.0.0 --port $CHAINLIT_VECTOR_PORT
      ;;
    *)
      echo "Starting default Flask GraphRAG app on port $FLASK_GRAPH_PORT..."
      python3 app_flask_graph.py
      ;;
  esac
else
  echo "Not in deploy mode, entering interactive shell."
  echo ""
  echo "Available commands:"
  echo "  python3 app_flask_graph.py          - Start Flask GraphRAG app"
  echo "  python3 app_flask.py                - Start Flask VectorRAG app"
  echo "  chainlit run app_chainlit_graph.py  - Start Chainlit GraphRAG app"
  echo "  chainlit run app_chainlit.py        - Start Chainlit VectorRAG app"

  if [ "$LLM_MODE" = "local_ollama" ]; then
    echo "  ollama pull $ollama_model           - Download Ollama model"
  fi
  echo ""
  /bin/bash
fi
```
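The startup script waits for Ollama by probing its TCP port with `nc -z` in a loop. For reference, the same wait can be expressed in Python with only the standard library; this sketch mirrors the script's behavior, including looping forever if the port never opens:

```python
# Python equivalent (sketch) of the script's `while ! nc -z localhost $PORT` loop.
import socket
import time

def wait_for_port(host: str, port: int, interval: float = 1.0) -> None:
    """Block until a TCP connection to (host, port) succeeds."""
    while True:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(interval)
            if s.connect_ex((host, port)) == 0:
                return  # port is open
        time.sleep(interval)

wait_for_port("localhost", 11434)  # default OLLAMA_PORT
```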
env.sample.txt
ADDED
@@ -0,0 +1,177 @@
```bash
# ============================================
# AllyCAT GraphRAG Configuration
# ============================================
# This file contains all configuration options for AllyCAT GraphRAG.
# Copy this file to .env and customize the values.

# ============================================
# Deployment Mode
# ============================================
# Automatically run the complete pipeline on startup (Docker deployments)
# Set to true for Heroku, AWS, Google Cloud Run, etc.
AUTO_RUN_PIPELINE=false

# Memory Optimization: Remove pipeline dependencies after completion
# Saves ~350-500 MB RAM - recommended for 1GB containers (DigitalOcean, etc.)
# Set to true to automatically clean up heavy packages after pipeline completes
CLEANUP_PIPELINE_DEPS=false

# ============================================
# Website Crawling Configuration
# ============================================
# Website to crawl (required if AUTO_RUN_PIPELINE=true)
WEBSITE_URL=https://example.com
CRAWL_MAX_DOWNLOADS=100
CRAWL_MAX_DEPTH=3
WAITTIME_BETWEEN_REQUESTS=0.1

# ============================================
# LLM Configuration (Cloud-First)
# ============================================
# LLM Runtime Environment
# Options: cloud, local_ollama
LLM_RUN_ENV=cloud

# LLM Model Selection
# Cloud providers: cerebras/llama3.1-8b, gemini/gemini-1.5-flash, nebius/meta-llama/Meta-Llama-3.1-8B-Instruct
# Local: ollama/gemma3:1b
LLM_MODEL=cerebras/llama3.1-8b

# ============================================
# LLM API Keys (Cloud Providers)
# ============================================
# Get your FREE API keys:
# - Cerebras: https://cerebras.ai/ (recommended)
# - Gemini: https://aistudio.google.com/
# - Nebius: https://studio.nebius.ai/

CEREBRAS_API_KEY=your_cerebras_api_key
GEMINI_API_KEY=your_gemini_api_key
NEBIUS_API_KEY=your_nebius_api_key

# ============================================
# Local Ollama Configuration (Optional)
# ============================================
# Only needed if LLM_RUN_ENV=local_ollama
# OLLAMA_MODEL=gemma3:1b
# OLLAMA_BASE_URL=http://localhost:11434

# ============================================
# Vector Database Configuration
# ============================================
# Options: cloud_zilliz (recommended), local
VECTOR_DB_TYPE=cloud_zilliz

# Zilliz Cloud Configuration (https://cloud.zilliz.com/)
ZILLIZ_CLUSTER_ENDPOINT=https://your-cluster.zilliz.cloud
ZILLIZ_TOKEN=your_zilliz_token

# Local Milvus Configuration (only if VECTOR_DB_TYPE=local)
# MILVUS_URI=./workspace/milvus_lite.db

# ============================================
# Graph Database Configuration (Neo4j)
# ============================================
# Neo4j Aura (Cloud) - Recommended: https://neo4j.com/cloud/aura/
NEO4J_URI=neo4j+s://your-instance.databases.neo4j.io
NEO4J_USERNAME=neo4j
NEO4J_PASSWORD=your_neo4j_password
NEO4J_DATABASE=neo4j

# Local Neo4j (only for development)
# NEO4J_URI=bolt://localhost:7687

# ============================================
# Graph Extraction LLM Provider
# ============================================
# Provider for entity/relationship extraction
# Options: gemini (recommended, 1500 free requests/day), cerebras
GRAPH_LLM_PROVIDER=gemini

# API keys are shared from LLM Configuration section above

# ============================================
# Embedding Model Configuration
# ============================================
# Embedding model for semantic search
# Options:
# - ibm-granite/granite-embedding-30m-english (61 MB, fastest)
# - BAAI/bge-small-en-v1.5 (129 MB, balanced)
# - ibm-granite/granite-embedding-107m-multilingual (219 MB, multilingual)
EMBEDDING_MODEL=ibm-granite/granite-embedding-30m-english
EMBEDDING_LENGTH=384

# ============================================
# Chunking Configuration
# ============================================
CHUNK_SIZE=512
CHUNK_OVERLAP=20

# ============================================
# Graph Extraction Configuration
# ============================================
# Entity and relationship extraction parameters
GRAPH_MIN_ENTITIES=5
GRAPH_MAX_ENTITIES=15
GRAPH_MIN_RELATIONSHIPS=3
GRAPH_MAX_RELATIONSHIPS=8
GRAPH_MIN_CONFIDENCE=0.8
GRAPH_MAX_CONTENT_CHARS=12000
GRAPH_SENTENCE_BOUNDARY_RATIO=0.7

# ============================================
# Graph Community Detection (Phase 2)
# ============================================
# Leiden algorithm parameters for community detection
GRAPH_MIN_COMMUNITY_SIZE=5
GRAPH_LEIDEN_RESOLUTION=1.0
GRAPH_LEIDEN_ITERATIONS=-1
GRAPH_LEIDEN_SEED=42
GRAPH_TARGET_COVERAGE_MIN=5.0
GRAPH_TARGET_COVERAGE_MAX=8.0
GRAPH_RESOLUTION_CANDIDATES=0.1,0.5,1.0,2.0,5.0,10.0,20.0,30.0,50.0,100.0
GRAPH_MIN_NODES_FOR_OPTIMIZATION=50

# ============================================
# Application Configuration
# ============================================
# Application type for Docker deployment
# Options: flask_graph (default), chainlit_graph, flask
APP_TYPE=flask_graph

# Flask server port
PORT=8080

# UI starter prompts (pipe-separated)
UI_STARTER_PROMPTS=What is this website? | What are upcoming events? | Who are some of the partners?

# ============================================
# Port Configuration
# ============================================
# Flask apps (Vector RAG vs GraphRAG) - Auto-configured via MY_CONFIG
FLASK_VECTOR_PORT=8081   # app_flask.py (vector-only RAG)
FLASK_GRAPH_PORT=8080    # app_flask_graph.py (GraphRAG)

# Chainlit apps (interactive UI) - Default port: 8000, custom ports for Docker
CHAINLIT_VECTOR_PORT=8082  # app_chainlit.py (Docker only; native Python uses 8000)
CHAINLIT_GRAPH_PORT=8083   # app_chainlit_graph.py (Docker only; native Python uses 8000)

# Docker and external services
DOCKER_PORT=8080      # External Docker exposed port (host side)
DOCKER_APP_PORT=8080  # Internal container port (container side, matches APP_TYPE)
OLLAMA_PORT=11434     # Ollama server port (for local LLM)

# ============================================
# Workspace Configuration
# ============================================
# For native execution: use relative path 'workspace'
# For Docker: use absolute path '/allycat/workspace'
WORKSPACE_DIR=workspace

# ============================================
# Advanced Configuration
# ============================================
# Hugging Face endpoint (for Chinese users or custom mirrors)
HF_ENDPOINT=https://huggingface.co
```
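Most values above are scalars, but `GRAPH_RESOLUTION_CANDIDATES` is a comma-separated list of floats and `UI_STARTER_PROMPTS` is a pipe-separated list of strings. A sketch of how such values can be parsed; the actual parsing lives in the pipeline scripts and in `my_config.py` (shown later), not here:

```python
# Parse the two list-valued settings from env.sample.txt (sketch).
import os

raw = os.getenv("GRAPH_RESOLUTION_CANDIDATES", "0.1,0.5,1.0,2.0,5.0")
candidates = [float(x) for x in raw.split(",")]

prompts_raw = os.getenv("UI_STARTER_PROMPTS", "What is this website? | What are upcoming events?")
prompts = [p.strip() for p in prompts_raw.split("|")]

print(candidates)  # [0.1, 0.5, 1.0, 2.0, 5.0]
print(prompts)     # ['What is this website?', 'What are upcoming events?']
```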
file_utils.py
ADDED
@@ -0,0 +1,56 @@
```python
import os
import requests
from humanfriendly import format_size
import pandas as pd
import glob
from urllib.parse import unquote


## Reads parquet files in a folder into a pandas dataframe
def read_parquet_files_as_df(parquet_dir):
    parquet_files = glob.glob(f'{parquet_dir}/*.parquet')

    # read each parquet file into a DataFrame and store in a list
    dfs = [pd.read_parquet(f) for f in parquet_files]

    # Concatenate all DataFrames into a single DataFrame
    data_df = pd.concat(dfs, ignore_index=True)
    return data_df


def download_file(url, local_file, chunk_size=1024*1024):
    """
    Downloads a remote URL to a local file.

    Args:
        url (str): The remote URL.
        local_file (str): The name of the local file to save the downloaded content.
        chunk_size (int): The size in bytes of each chunk. Defaults to 1 MB (1024*1024).

    Returns:
        None

    Example usage:
        download_file('http://example.com/file.txt', 'file.txt', chunk_size=1024*1024)  # Download in chunks of 1 MB
    """
    # Check if the local file already exists
    if os.path.exists(local_file):
        file_size = format_size(os.path.getsize(local_file))
        print(f"Local file '{local_file}' ({file_size}) already exists. Skipping download.")
        return

    # Create the directory if it doesn't exist
    os.makedirs(os.path.dirname(local_file), exist_ok=True)

    # Stream the file download
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_file, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    print()
    file_size = format_size(os.path.getsize(local_file))
    print(f"{local_file} ({file_size}) downloaded successfully.")
## --- end: download_file ------
```
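Example usage of the two helpers above; the URL and paths are hypothetical:

```python
# Download a file once, then load all parquet files in the folder (sketch).
from file_utils import download_file, read_parquet_files_as_df

download_file("https://example.com/data.parquet",  # hypothetical URL
              "workspace/downloads/data.parquet")
df = read_parquet_files_as_df("workspace/downloads")
print(df.shape)
```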
litellm_patch.py
ADDED
@@ -0,0 +1,106 @@
```python
"""
LiteLLM Async Task Cleanup Patch

This module patches LiteLLM's asynchronous logging to ensure all tasks complete properly
and prevent "Task was destroyed but it is pending!" errors.
"""

import asyncio
import logging
import functools
import inspect
import sys
from typing import Any, Callable, Coroutine

logger = logging.getLogger(__name__)

# Global registry of pending LiteLLM async tasks
_pending_tasks = set()

def _patch_litellm_async_logging():
    """
    Patches LiteLLM async logging functions to ensure proper task cleanup.
    Prevents "Task was destroyed but it is pending!" errors.
    """
    try:
        # Try to import LiteLLM modules
        import litellm
        from litellm.utils import _client_async_logging_helper

        # Store original function
        original_client_async_logging = _client_async_logging_helper

        # Create patched version with error handling
        @functools.wraps(original_client_async_logging)
        async def patched_client_async_logging_helper(*args, **kwargs):
            try:
                return await original_client_async_logging(*args, **kwargs)
            except Exception as e:
                logger.warning(f"LiteLLM async logging error (handled): {e}")
                return None

        # Apply patch
        litellm.utils._client_async_logging_helper = patched_client_async_logging_helper

        # Patch Logging class async_success_handler if available
        if hasattr(litellm, 'litellm_core_utils') and hasattr(litellm.litellm_core_utils, 'litellm_logging'):
            from litellm.litellm_core_utils.litellm_logging import Logging

            if hasattr(Logging, 'async_success_handler'):
                original_async_success_handler = Logging.async_success_handler

                @functools.wraps(original_async_success_handler)
                async def patched_async_success_handler(*args, **kwargs):
                    try:
                        return await original_async_success_handler(*args, **kwargs)
                    except Exception as e:
                        logger.warning(f"LiteLLM async_success_handler error (handled): {e}")
                        return None

                Logging.async_success_handler = patched_async_success_handler

        logger.info("Successfully patched LiteLLM async logging functions")
        return True

    except ImportError:
        logger.warning("Could not find LiteLLM modules to patch")
        return False
    except Exception as e:
        logger.error(f"Error patching LiteLLM: {e}")
        return False


def create_task_with_cleanup(coro: Coroutine) -> asyncio.Task:
    """
    Creates an asyncio task with automatic cleanup registration.
    Prevents orphaned tasks and associated warnings.
    """
    task = asyncio.create_task(coro)
    _pending_tasks.add(task)
    task.add_done_callback(_pending_tasks.discard)
    return task


async def cleanup_all_async_tasks(timeout: float = 2.0):
    """
    Waits for pending async tasks to complete within timeout period.
    Should be called before exiting async contexts to prevent warnings.
    """
    if not _pending_tasks:
        return

    logger.debug(f"Cleaning up {len(_pending_tasks)} pending async tasks...")
    try:
        # Wait for all pending tasks with a timeout
        done, pending = await asyncio.wait(
            _pending_tasks, timeout=timeout, return_when=asyncio.ALL_COMPLETED
        )

        if pending:
            logger.warning(f"{len(pending)} async tasks still pending after timeout")
    except Exception as e:
        logger.error(f"Error during async task cleanup: {e}")


# Apply the patch when this module is imported
_patch_litellm_async_logging()
```
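Since the patch is applied at import time, using the module is mostly a matter of importing it early; the two helpers can then wrap coroutines that trigger LiteLLM's async logging. A usage sketch (the inner coroutine is a stand-in for a real LLM call):

```python
import asyncio

import litellm_patch  # side effect: patches LiteLLM's async logging

async def main():
    async def do_work():
        await asyncio.sleep(0.1)  # stand-in for an LLM call

    task = litellm_patch.create_task_with_cleanup(do_work())
    await task
    # Give any logging tasks LiteLLM spawned a chance to finish.
    await litellm_patch.cleanup_all_async_tasks(timeout=2.0)

asyncio.run(main())
```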
my_config.py
ADDED
@@ -0,0 +1,131 @@
```python
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

## Configuration

class MyConfig:
    pass

MY_CONFIG = MyConfig()

## All of these settings can be overridden by .env file
## And it will be loaded automatically by load_dotenv()
## And they will take precedence over the default values below
## See sample .env file 'env.sample.txt' for reference

## HuggingFace config
MY_CONFIG.HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://thealliance.ai/")

## Crawl settings
MY_CONFIG.WEBSITE_URL = os.getenv("WEBSITE_URL", "")
MY_CONFIG.CRAWL_MAX_DOWNLOADS = int(os.getenv("CRAWL_MAX_DOWNLOADS", 100))
MY_CONFIG.CRAWL_MAX_DEPTH = int(os.getenv("CRAWL_MAX_DEPTH", 3))
MY_CONFIG.WAITTIME_BETWEEN_REQUESTS = float(os.getenv("WAITTIME_BETWEEN_REQUESTS", 0.1))  # in seconds
MY_CONFIG.CRAWL_MIME_TYPE = 'text/html'


## Directories
MY_CONFIG.WORKSPACE_DIR = os.path.join(os.getenv('WORKSPACE_DIR', 'workspace'))
MY_CONFIG.CRAWL_DIR = os.path.join(MY_CONFIG.WORKSPACE_DIR, "crawled")
MY_CONFIG.PROCESSED_DATA_DIR = os.path.join(MY_CONFIG.WORKSPACE_DIR, "processed")

## llama index will download the models to this directory
os.environ["LLAMA_INDEX_CACHE_DIR"] = os.path.join(MY_CONFIG.WORKSPACE_DIR, "llama_index_cache")
### -------------------------------

# Find embedding models: https://huggingface.co/spaces/mteb/leaderboard

MY_CONFIG.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", 'ibm-granite/granite-embedding-30m-english')
MY_CONFIG.EMBEDDING_LENGTH = int(os.getenv("EMBEDDING_LENGTH", 384))

## Chunking
MY_CONFIG.CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", 512))
MY_CONFIG.CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", 20))


### Milvus config
MY_CONFIG.COLLECTION_NAME = 'pages'

# Separate Milvus databases for different RAG approaches
# This allows running Vector RAG and Hybrid GraphRAG simultaneously without conflicts
MY_CONFIG.MILVUS_URI_VECTOR = os.path.join(MY_CONFIG.WORKSPACE_DIR, 'vector_only_milvus.db')          # Vector RAG only
MY_CONFIG.MILVUS_URI_HYBRID_GRAPH = os.path.join(MY_CONFIG.WORKSPACE_DIR, 'hybrid_graph_milvus.db')   # Hybrid GraphRAG

# Vector Database Configuration
MY_CONFIG.VECTOR_DB_TYPE = os.getenv("VECTOR_DB_TYPE", "cloud_zilliz")  # Options: "local" or "cloud_zilliz"

# Zilliz Cloud Configuration (for cloud deployment)
MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT = os.getenv("ZILLIZ_CLUSTER_ENDPOINT")
MY_CONFIG.ZILLIZ_TOKEN = os.getenv("ZILLIZ_TOKEN")


## ---- LLM settings ----
## Choose one: We can do local or cloud LLMs
# LLM_RUN_ENV controls which LLM backend to use: 'local_ollama' for local Ollama, 'cloud' for cloud LLMs
# Set LLM_RUN_ENV in your .env file. Default is 'cloud' for production deployment.
## Local LLMs are run on your machine using Ollama
## Cloud LLMs are run on any LiteLLM supported service like Replicate / Nebius / Cerebras / etc
## For running Ollama locally, please check the instructions in the docs/llm-local.md file

MY_CONFIG.LLM_RUN_ENV = os.getenv("LLM_RUN_ENV", "cloud")

MY_CONFIG.LLM_MODEL = os.getenv("LLM_MODEL", 'cerebras/llama3.1-8b')

# Replicate API token (if using Replicate)
MY_CONFIG.REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN", None)
# Nebius API key (if using Nebius)
MY_CONFIG.NEBIUS_API_KEY = os.getenv("NEBIUS_API_KEY", None)

# --- GraphBuilder LLM API keys ---
MY_CONFIG.CEREBRAS_API_KEY = os.getenv("CEREBRAS_API_KEY", None)
MY_CONFIG.GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", None)

# --- Graph entity/relationship extraction config ---
MY_CONFIG.GRAPH_MIN_ENTITIES = int(os.getenv("GRAPH_MIN_ENTITIES", 5))
MY_CONFIG.GRAPH_MAX_ENTITIES = int(os.getenv("GRAPH_MAX_ENTITIES", 15))
MY_CONFIG.GRAPH_MIN_RELATIONSHIPS = int(os.getenv("GRAPH_MIN_RELATIONSHIPS", 3))
MY_CONFIG.GRAPH_MAX_RELATIONSHIPS = int(os.getenv("GRAPH_MAX_RELATIONSHIPS", 8))
MY_CONFIG.GRAPH_MIN_CONFIDENCE = float(os.getenv("GRAPH_MIN_CONFIDENCE", 0.8))
MY_CONFIG.GRAPH_MAX_CONTENT_CHARS = int(os.getenv("GRAPH_MAX_CONTENT_CHARS", 12000))
MY_CONFIG.GRAPH_SENTENCE_BOUNDARY_RATIO = float(os.getenv("GRAPH_SENTENCE_BOUNDARY_RATIO", 0.7))


## --- GraphRAG ---
# --- Neo4j config ---
MY_CONFIG.NEO4J_URI = os.getenv("NEO4J_URI")
MY_CONFIG.NEO4J_USER = os.getenv("NEO4J_USERNAME")
MY_CONFIG.NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
MY_CONFIG.NEO4J_DATABASE = os.getenv("NEO4J_DATABASE")
MY_CONFIG.GRAPH_DATA_DIR = os.path.join(MY_CONFIG.WORKSPACE_DIR, "graph_data")


## --- UI settings ---
MY_CONFIG.STARTER_PROMPTS_STR = os.getenv("UI_STARTER_PROMPTS", 'What is this website? | What are upcoming events? | Who are some of the partners?')

MY_CONFIG.STARTER_PROMPTS = MY_CONFIG.STARTER_PROMPTS_STR.split("|") if MY_CONFIG.STARTER_PROMPTS_STR else []

## --- Port Configuration ---
# Flask apps (auto-configured via MY_CONFIG)
MY_CONFIG.FLASK_VECTOR_PORT = int(os.getenv("FLASK_VECTOR_PORT", 8081))  # app_flask.py (vector RAG)
MY_CONFIG.FLASK_GRAPH_PORT = int(os.getenv("FLASK_GRAPH_PORT", 8080))    # app_flask_graph.py (GraphRAG)

# Chainlit apps (default port: 8000, custom ports for Docker deployments)
MY_CONFIG.CHAINLIT_VECTOR_PORT = int(os.getenv("CHAINLIT_VECTOR_PORT", 8082))  # app_chainlit.py (Docker: 8082, Native: 8000)
MY_CONFIG.CHAINLIT_GRAPH_PORT = int(os.getenv("CHAINLIT_GRAPH_PORT", 8083))    # app_chainlit_graph.py (Docker: 8083, Native: 8000)

# Docker and external services
MY_CONFIG.DOCKER_PORT = int(os.getenv("DOCKER_PORT", 8080))          # External host port (maps to DOCKER_APP_PORT)
MY_CONFIG.DOCKER_APP_PORT = int(os.getenv("DOCKER_APP_PORT", 8080))  # Internal container port (all apps use this in Docker)
MY_CONFIG.OLLAMA_PORT = int(os.getenv("OLLAMA_PORT", 11434))         # Ollama server port
```
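Typical usage is to import the `MY_CONFIG` singleton and read attributes from it, as the apps and pipeline scripts do:

```python
from my_config import MY_CONFIG

print(MY_CONFIG.LLM_MODEL)         # e.g. 'cerebras/llama3.1-8b'
print(MY_CONFIG.FLASK_GRAPH_PORT)  # 8080 unless overridden in .env
# Note: split("|") keeps surrounding spaces, so prompts may need .strip()
print(MY_CONFIG.STARTER_PROMPTS)
```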
news.md
ADDED
@@ -0,0 +1,51 @@
```markdown
# Allycat News

## 2025-10-17: GraphRAG

Niloy Deb Barma from [Vel Tech University](https://www.veltech.edu.in/btech-admissions/) in Chennai is in the process of adding GraphRAG to AllyCat.

## 2025-08-23: Expanded Team

In addition to Sujee and Dave, we have begun working with Chirag, Nikhil and the team at [Open Governance](https://www.opgov.ai).

## 2025-07-14: Big Update

Lots of cool updates:

**Robust web crawler** ([#31](https://github.com/The-AI-Alliance/allycat/issues/31))

Completely redid the web crawler. Now it:
- is more robust and handles scenarios that made the previous crawler fail
- can handle multiple file types (not just text/html) correctly
- handles anchor tags (`a.html#news`) in HTML files correctly
- pauses (customizable) between requests so as not to hammer the webserver

**Using [LiteLLM](https://docs.litellm.ai/docs/) for LLM inference** ([#34](https://github.com/The-AI-Alliance/allycat/issues/34))

This allows us to seamlessly access LLMs running locally (using [ollama](https://ollama.com/)) or to call inference providers like Nebius, Replicate, etc.

Also significantly simplified LLM configuration.

**Expanded support for many file types (pdf, docx)** ([#37](https://github.com/The-AI-Alliance/allycat/issues/37))

Before, we handled just HTML files. Now we can download and process other popular file types, like PDF, DOCX, etc. We use [Docling](https://github.com/docling-project/docling) for processing files.

**Added [uv](https://docs.astral.sh/uv/) package manager support** ([#26](https://github.com/The-AI-Alliance/allycat/issues/26))

uv will be the preferred package manager going forward. We will still maintain `requirements.txt` to support other package managers.

**Better config management** ([#19](https://github.com/The-AI-Alliance/allycat/issues/19))

A lot of user configuration can be set using the `.env` file. This simplifies config management and allows for easier and faster experimentation without changing code.

**Documentation update**

Various doc updates.

**Huge thanks to all the contributors**

- [Steven Pousty](https://github.com/thesteve0) ([linkedin](https://www.linkedin.com/in/thesteve0/))
- [Santosh Borse](https://github.com/santoshborse)
```
package-lock.json
ADDED
@@ -0,0 +1,523 @@
```json
{
  "name": "allycat",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "dependencies": {
        "dotenv": "^17.2.1",
        "neo4j": "^2.0.0-RC2"
      }
    },
    "node_modules/ajv": {
      "version": "6.12.6",
      "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
      "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==",
      "license": "MIT",
      "dependencies": {
        "fast-deep-equal": "^3.1.1",
        "fast-json-stable-stringify": "^2.0.0",
        "json-schema-traverse": "^0.4.1",
        "uri-js": "^4.2.2"
      },
      "funding": {
        "type": "github",
        "url": "https://github.com/sponsors/epoberezkin"
      }
    },
    "node_modules/asn1": {
      "version": "0.2.6",
      "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.6.tgz",
      "integrity": "sha512-ix/FxPn0MDjeyJ7i/yoHGFt/EX6LyNbxSEhPPXODPL+KB0VPk86UYfL0lMdy+KCnv+fmvIzySwaK5COwqVbWTQ==",
      "license": "MIT",
      "dependencies": {
        "safer-buffer": "~2.1.0"
      }
    },
    "node_modules/assert-plus": {
      "version": "1.0.0",
      "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz",
      "integrity": "sha512-NfJ4UzBCcQGLDlQq7nHxH+tv3kyZ0hHQqF5BO6J7tNJeP5do1llPr8dZ8zHonfhAu0PHAdMkSo+8o0wxg9lZWw==",
      "license": "MIT",
      "engines": {
        "node": ">=0.8"
      }
    },
    "node_modules/asynckit": {
      "version": "0.4.0",
      "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
      "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==",
      "license": "MIT"
    },
    "node_modules/aws-sign2": {
      "version": "0.7.0",
      "resolved": "https://registry.npmjs.org/aws-sign2/-/aws-sign2-0.7.0.tgz",
      "integrity": "sha512-08kcGqnYf/YmjoRhfxyu+CLxBjUtHLXLXX/vUfx9l2LYzG3c1m61nrpyFUZI6zeS+Li/wWMMidD9KgrqtGq3mA==",
      "license": "Apache-2.0",
      "engines": {
        "node": "*"
      }
    },
    "node_modules/aws4": {
      "version": "1.13.2",
      "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz",
      "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==",
      "license": "MIT"
    },
    "node_modules/bcrypt-pbkdf": {
      "version": "1.0.2",
      "resolved": "https://registry.npmjs.org/bcrypt-pbkdf/-/bcrypt-pbkdf-1.0.2.tgz",
      "integrity": "sha512-qeFIXtP4MSoi6NLqO12WfqARWWuCKi2Rn/9hJLEmtB5yTNr9DqFWkJRCf2qShWzPeAMRnOgCrq0sg/KLv5ES9w==",
      "license": "BSD-3-Clause",
      "dependencies": {
        "tweetnacl": "^0.14.3"
      }
    },
    "node_modules/caseless": {
      "version": "0.12.0",
      "resolved": "https://registry.npmjs.org/caseless/-/caseless-0.12.0.tgz",
      "integrity": "sha512-4tYFyifaFfGacoiObjJegolkwSU4xQNGbVgUiNYVUxbQ2x2lUsFvY4hVgVzGiIe6WLOPqycWXA40l+PWsxthUw==",
      "license": "Apache-2.0"
    },
    "node_modules/combined-stream": {
      "version": "1.0.8",
      "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
      "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
      "license": "MIT",
      "dependencies": {
        "delayed-stream": "~1.0.0"
      },
      "engines": {
        "node": ">= 0.8"
      }
    },
    "node_modules/core-util-is": {
      "version": "1.0.2",
      "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz",
      "integrity": "sha512-3lqz5YjWTYnW6dlDa5TLaTCcShfar1e40rmcJVwCBJC6mWlFuj0eCHIElmG1g5kyuJ/GD+8Wn4FFCcz4gJPfaQ==",
      "license": "MIT"
    },
    "node_modules/dashdash": {
      "version": "1.14.1",
      "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz",
      "integrity": "sha512-jRFi8UDGo6j+odZiEpjazZaWqEal3w/basFjQHQEwVtZJGDpxbH1MeYluwCS8Xq5wmLJooDlMgvVarmWfGM44g==",
      "license": "MIT",
      "dependencies": {
        "assert-plus": "^1.0.0"
      },
      "engines": {
        "node": ">=0.10"
      }
    },
    "node_modules/delayed-stream": {
      "version": "1.0.0",
      "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
      "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
      "license": "MIT",
      "engines": {
        "node": ">=0.4.0"
      }
    },
    "node_modules/dotenv": {
      "version": "17.2.1",
      "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.2.1.tgz",
      "integrity": "sha512-kQhDYKZecqnM0fCnzI5eIv5L4cAe/iRI+HqMbO/hbRdTAeXDG+M9FjipUxNfbARuEg4iHIbhnhs78BCHNbSxEQ==",
      "license": "BSD-2-Clause",
      "engines": {
        "node": ">=12"
      },
      "funding": {
        "url": "https://dotenvx.com"
      }
    },
    "node_modules/ecc-jsbn": {
      "version": "0.1.2",
      "resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.2.tgz",
      "integrity": "sha512-eh9O+hwRHNbG4BLTjEl3nw044CkGm5X6LoaCf7LPp7UU8Qrt47JYNi6nPX8xjW97TKGKm1ouctg0QSpZe9qrnw==",
      "license": "MIT",
      "dependencies": {
        "jsbn": "~0.1.0",
        "safer-buffer": "^2.1.0"
      }
    },
    "node_modules/extend": {
      "version": "3.0.2",
      "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
      "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==",
      "license": "MIT"
    },
    "node_modules/extsprintf": {
      "version": "1.3.0",
      "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz",
      "integrity": "sha512-11Ndz7Nv+mvAC1j0ktTa7fAb0vLyGGX+rMHNBYQviQDGU0Hw7lhctJANqbPhu9nV9/izT/IntTgZ7Im/9LJs9g==",
      "engines": [
        "node >=0.6.0"
      ],
      "license": "MIT"
    },
    "node_modules/fast-deep-equal": {
      "version": "3.1.3",
      "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz",
      "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==",
      "license": "MIT"
    },
    "node_modules/fast-json-stable-stringify": {
      "version": "2.1.0",
      "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz",
      "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==",
      "license": "MIT"
    },
    "node_modules/forever-agent": {
      "version": "0.6.1",
      "resolved": "https://registry.npmjs.org/forever-agent/-/forever-agent-0.6.1.tgz",
      "integrity": "sha512-j0KLYPhm6zeac4lz3oJ3o65qvgQCcPubiyotZrXqEaG4hNagNYO8qdlUrX5vwqv9ohqeT/Z3j6+yW067yWWdUw==",
      "license": "Apache-2.0",
      "engines": {
        "node": "*"
      }
    },
    "node_modules/form-data": {
      "version": "2.3.3",
      "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz",
      "integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==",
      "license": "MIT",
      "dependencies": {
        "asynckit": "^0.4.0",
        "combined-stream": "^1.0.6",
        "mime-types": "^2.1.12"
      },
      "engines": {
        "node": ">= 0.12"
      }
    },
    "node_modules/getpass": {
      "version": "0.1.7",
      "resolved": "https://registry.npmjs.org/getpass/-/getpass-0.1.7.tgz",
      "integrity": "sha512-0fzj9JxOLfJ+XGLhR8ze3unN0KZCgZwiSSDz168VERjK8Wl8kVSdcu2kspd4s4wtAa1y/qrVRiAA0WclVsu0ng==",
      "license": "MIT",
      "dependencies": {
        "assert-plus": "^1.0.0"
      }
    },
    "node_modules/har-schema": {
      "version": "2.0.0",
      "resolved": "https://registry.npmjs.org/har-schema/-/har-schema-2.0.0.tgz",
      "integrity": "sha512-Oqluz6zhGX8cyRaTQlFMPw80bSJVG2x/cFb8ZPhUILGgHka9SsokCCOQgpveePerqidZOrT14ipqfJb7ILcW5Q==",
      "license": "ISC",
      "engines": {
        "node": ">=4"
      }
    },
    "node_modules/har-validator": {
      "version": "5.1.5",
      "resolved": "https://registry.npmjs.org/har-validator/-/har-validator-5.1.5.tgz",
      "integrity": "sha512-nmT2T0lljbxdQZfspsno9hgrG3Uir6Ks5afism62poxqBM6sDnMEuPmzTq8XN0OEwqKLLdh1jQI3qyE66Nzb3w==",
      "deprecated": "this library is no longer supported",
      "license": "MIT",
      "dependencies": {
        "ajv": "^6.12.3",
        "har-schema": "^2.0.0"
      },
      "engines": {
        "node": ">=6"
      }
    },
    "node_modules/http-signature": {
      "version": "1.2.0",
      "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.2.0.tgz",
      "integrity": "sha512-CAbnr6Rz4CYQkLYUtSNXxQPUH2gK8f3iWexVlsnMeD+GjlsQ0Xsy1cOX+mN3dtxYomRy21CiOzU8Uhw6OwncEQ==",
      "license": "MIT",
      "dependencies": {
        "assert-plus": "^1.0.0",
        "jsprim": "^1.2.2",
        "sshpk": "^1.7.0"
      },
      "engines": {
        "node": ">=0.8",
        "npm": ">=1.3.7"
      }
    },
    "node_modules/is-typedarray": {
      "version": "1.0.0",
      "resolved": "https://registry.npmjs.org/is-typedarray/-/is-typedarray-1.0.0.tgz",
      "integrity": "sha512-cyA56iCMHAh5CdzjJIa4aohJyeO1YbwLi3Jc35MmRU6poroFjIGZzUzupGiRPOjgHg9TLu43xbpwXk523fMxKA==",
      "license": "MIT"
    },
    "node_modules/isstream": {
      "version": "0.1.2",
      "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz",
      "integrity": "sha512-Yljz7ffyPbrLpLngrMtZ7NduUgVvi6wG9RJ9IUcyCd59YQ911PBJphODUcbOVbqYfxe1wuYf/LJ8PauMRwsM/g==",
      "license": "MIT"
    },
    "node_modules/jsbn": {
      "version": "0.1.1",
      "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
      "integrity": "sha512-UVU9dibq2JcFWxQPA6KCqj5O42VOmAY3zQUfEKxU0KpTGXwNoCjkX1e13eHNvw/xPynt6pU0rZ1htjWTNTSXsg==",
      "license": "MIT"
    },
    "node_modules/json-schema": {
      "version": "0.4.0",
      "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz",
      "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==",
      "license": "(AFL-2.1 OR BSD-3-Clause)"
    },
    "node_modules/json-schema-traverse": {
      "version": "0.4.1",
      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",
      "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==",
      "license": "MIT"
    },
    "node_modules/json-stringify-safe": {
      "version": "5.0.1",
      "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz",
      "integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==",
      "license": "ISC"
    },
    "node_modules/jsprim": {
      "version": "1.4.2",
      "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.2.tgz",
      "integrity": "sha512-P2bSOMAc/ciLz6DzgjVlGJP9+BrJWu5UDGK70C2iweC5QBIeFf0ZXRvGjEj2uYgrY2MkAAhsSWHDWlFtEroZWw==",
      "license": "MIT",
      "dependencies": {
        "assert-plus": "1.0.0",
        "extsprintf": "1.3.0",
        "json-schema": "0.4.0",
        "verror": "1.10.0"
      },
      "engines": {
```
"node": ">=0.6.0"
|
| 289 |
+
}
|
| 290 |
+
},
|
| 291 |
+
"node_modules/mime-db": {
|
| 292 |
+
"version": "1.52.0",
|
| 293 |
+
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
|
| 294 |
+
"integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
|
| 295 |
+
"license": "MIT",
|
| 296 |
+
"engines": {
|
| 297 |
+
"node": ">= 0.6"
|
| 298 |
+
}
|
| 299 |
+
},
|
| 300 |
+
"node_modules/mime-types": {
|
| 301 |
+
"version": "2.1.35",
|
| 302 |
+
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
|
| 303 |
+
"integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
|
| 304 |
+
"license": "MIT",
|
| 305 |
+
"dependencies": {
|
| 306 |
+
"mime-db": "1.52.0"
|
| 307 |
+
},
|
| 308 |
+
"engines": {
|
| 309 |
+
"node": ">= 0.6"
|
| 310 |
+
}
|
| 311 |
+
},
|
| 312 |
+
"node_modules/neo4j": {
|
| 313 |
+
"version": "2.0.0-RC2",
|
| 314 |
+
"resolved": "https://registry.npmjs.org/neo4j/-/neo4j-2.0.0-RC2.tgz",
|
| 315 |
+
"integrity": "sha512-TTTRwv8t3S0Mp6rVtgY4RNt+SCSl+ccuXhP6DmXERtNp5Vs8LlJ85uZiGJKcT74Xthqc4ihl517+bBOqQJxhNA==",
|
| 316 |
+
"license": "Apache-2.0",
|
| 317 |
+
"dependencies": {
|
| 318 |
+
"request": "^2.27.0",
|
| 319 |
+
"underscore": "1.7.x"
|
| 320 |
+
},
|
| 321 |
+
"engines": {
|
| 322 |
+
"node": ">= 0.10"
|
| 323 |
+
}
|
| 324 |
+
},
|
| 325 |
+
"node_modules/oauth-sign": {
|
| 326 |
+
"version": "0.9.0",
|
| 327 |
+
"resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.9.0.tgz",
|
| 328 |
+
"integrity": "sha512-fexhUFFPTGV8ybAtSIGbV6gOkSv8UtRbDBnAyLQw4QPKkgNlsH2ByPGtMUqdWkos6YCRmAqViwgZrJc/mRDzZQ==",
|
| 329 |
+
"license": "Apache-2.0",
|
| 330 |
+
"engines": {
|
| 331 |
+
"node": "*"
|
| 332 |
+
}
|
| 333 |
+
},
|
| 334 |
+
"node_modules/performance-now": {
|
| 335 |
+
"version": "2.1.0",
|
| 336 |
+
"resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz",
|
| 337 |
+
"integrity": "sha512-7EAHlyLHI56VEIdK57uwHdHKIaAGbnXPiw0yWbarQZOKaKpvUIgW0jWRVLiatnM+XXlSwsanIBH/hzGMJulMow==",
|
| 338 |
+
"license": "MIT"
|
| 339 |
+
},
|
| 340 |
+
"node_modules/psl": {
|
| 341 |
+
"version": "1.15.0",
|
| 342 |
+
"resolved": "https://registry.npmjs.org/psl/-/psl-1.15.0.tgz",
|
| 343 |
+
"integrity": "sha512-JZd3gMVBAVQkSs6HdNZo9Sdo0LNcQeMNP3CozBJb3JYC/QUYZTnKxP+f8oWRX4rHP5EurWxqAHTSwUCjlNKa1w==",
|
| 344 |
+
"license": "MIT",
|
| 345 |
+
"dependencies": {
|
| 346 |
+
"punycode": "^2.3.1"
|
| 347 |
+
},
|
| 348 |
+
"funding": {
|
| 349 |
+
"url": "https://github.com/sponsors/lupomontero"
|
| 350 |
+
}
|
| 351 |
+
},
|
| 352 |
+
"node_modules/punycode": {
|
| 353 |
+
"version": "2.3.1",
|
| 354 |
+
"resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz",
|
| 355 |
+
"integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==",
|
| 356 |
+
"license": "MIT",
|
| 357 |
+
"engines": {
|
| 358 |
+
"node": ">=6"
|
| 359 |
+
}
|
| 360 |
+
},
|
| 361 |
+
"node_modules/qs": {
|
| 362 |
+
"version": "6.5.3",
|
| 363 |
+
"resolved": "https://registry.npmjs.org/qs/-/qs-6.5.3.tgz",
|
| 364 |
+
"integrity": "sha512-qxXIEh4pCGfHICj1mAJQ2/2XVZkjCDTcEgfoSQxc/fYivUZxTkk7L3bDBJSoNrEzXI17oUO5Dp07ktqE5KzczA==",
|
| 365 |
+
"license": "BSD-3-Clause",
|
| 366 |
+
"engines": {
|
| 367 |
+
"node": ">=0.6"
|
| 368 |
+
}
|
| 369 |
+
},
|
| 370 |
+
"node_modules/request": {
|
| 371 |
+
"version": "2.88.2",
|
| 372 |
+
"resolved": "https://registry.npmjs.org/request/-/request-2.88.2.tgz",
|
| 373 |
+
"integrity": "sha512-MsvtOrfG9ZcrOwAW+Qi+F6HbD0CWXEh9ou77uOb7FM2WPhwT7smM833PzanhJLsgXjN89Ir6V2PczXNnMpwKhw==",
|
| 374 |
+
"deprecated": "request has been deprecated, see https://github.com/request/request/issues/3142",
|
| 375 |
+
"license": "Apache-2.0",
|
| 376 |
+
"dependencies": {
|
| 377 |
+
"aws-sign2": "~0.7.0",
|
| 378 |
+
"aws4": "^1.8.0",
|
| 379 |
+
"caseless": "~0.12.0",
|
| 380 |
+
"combined-stream": "~1.0.6",
|
| 381 |
+
"extend": "~3.0.2",
|
| 382 |
+
"forever-agent": "~0.6.1",
|
| 383 |
+
"form-data": "~2.3.2",
|
| 384 |
+
"har-validator": "~5.1.3",
|
| 385 |
+
"http-signature": "~1.2.0",
|
| 386 |
+
"is-typedarray": "~1.0.0",
|
| 387 |
+
"isstream": "~0.1.2",
|
| 388 |
+
"json-stringify-safe": "~5.0.1",
|
| 389 |
+
"mime-types": "~2.1.19",
|
| 390 |
+
"oauth-sign": "~0.9.0",
|
| 391 |
+
"performance-now": "^2.1.0",
|
| 392 |
+
"qs": "~6.5.2",
|
| 393 |
+
"safe-buffer": "^5.1.2",
|
| 394 |
+
"tough-cookie": "~2.5.0",
|
| 395 |
+
"tunnel-agent": "^0.6.0",
|
| 396 |
+
"uuid": "^3.3.2"
|
| 397 |
+
},
|
| 398 |
+
"engines": {
|
| 399 |
+
"node": ">= 6"
|
| 400 |
+
}
|
| 401 |
+
},
|
| 402 |
+
"node_modules/safe-buffer": {
|
| 403 |
+
"version": "5.2.1",
|
| 404 |
+
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
|
| 405 |
+
"integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
|
| 406 |
+
"funding": [
|
| 407 |
+
{
|
| 408 |
+
"type": "github",
|
| 409 |
+
"url": "https://github.com/sponsors/feross"
|
| 410 |
+
},
|
| 411 |
+
{
|
| 412 |
+
"type": "patreon",
|
| 413 |
+
"url": "https://www.patreon.com/feross"
|
| 414 |
+
},
|
| 415 |
+
{
|
| 416 |
+
"type": "consulting",
|
| 417 |
+
"url": "https://feross.org/support"
|
| 418 |
+
}
|
| 419 |
+
],
|
| 420 |
+
"license": "MIT"
|
| 421 |
+
},
|
| 422 |
+
"node_modules/safer-buffer": {
|
| 423 |
+
"version": "2.1.2",
|
| 424 |
+
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
|
| 425 |
+
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
|
| 426 |
+
"license": "MIT"
|
| 427 |
+
},
|
| 428 |
+
"node_modules/sshpk": {
|
| 429 |
+
"version": "1.18.0",
|
| 430 |
+
"resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.18.0.tgz",
|
| 431 |
+
"integrity": "sha512-2p2KJZTSqQ/I3+HX42EpYOa2l3f8Erv8MWKsy2I9uf4wA7yFIkXRffYdsx86y6z4vHtV8u7g+pPlr8/4ouAxsQ==",
|
| 432 |
+
"license": "MIT",
|
| 433 |
+
"dependencies": {
|
| 434 |
+
"asn1": "~0.2.3",
|
| 435 |
+
"assert-plus": "^1.0.0",
|
| 436 |
+
"bcrypt-pbkdf": "^1.0.0",
|
| 437 |
+
"dashdash": "^1.12.0",
|
| 438 |
+
"ecc-jsbn": "~0.1.1",
|
| 439 |
+
"getpass": "^0.1.1",
|
| 440 |
+
"jsbn": "~0.1.0",
|
| 441 |
+
"safer-buffer": "^2.0.2",
|
| 442 |
+
"tweetnacl": "~0.14.0"
|
| 443 |
+
},
|
| 444 |
+
"bin": {
|
| 445 |
+
"sshpk-conv": "bin/sshpk-conv",
|
| 446 |
+
"sshpk-sign": "bin/sshpk-sign",
|
| 447 |
+
"sshpk-verify": "bin/sshpk-verify"
|
| 448 |
+
},
|
| 449 |
+
"engines": {
|
| 450 |
+
"node": ">=0.10.0"
|
| 451 |
+
}
|
| 452 |
+
},
|
| 453 |
+
"node_modules/tough-cookie": {
|
| 454 |
+
"version": "2.5.0",
|
| 455 |
+
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.5.0.tgz",
|
| 456 |
+
"integrity": "sha512-nlLsUzgm1kfLXSXfRZMc1KLAugd4hqJHDTvc2hDIwS3mZAfMEuMbc03SujMF+GEcpaX/qboeycw6iO8JwVv2+g==",
|
| 457 |
+
"license": "BSD-3-Clause",
|
| 458 |
+
"dependencies": {
|
| 459 |
+
"psl": "^1.1.28",
|
| 460 |
+
"punycode": "^2.1.1"
|
| 461 |
+
},
|
| 462 |
+
"engines": {
|
| 463 |
+
"node": ">=0.8"
|
| 464 |
+
}
|
| 465 |
+
},
|
| 466 |
+
"node_modules/tunnel-agent": {
|
| 467 |
+
"version": "0.6.0",
|
| 468 |
+
"resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
|
| 469 |
+
"integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==",
|
| 470 |
+
"license": "Apache-2.0",
|
| 471 |
+
"dependencies": {
|
| 472 |
+
"safe-buffer": "^5.0.1"
|
| 473 |
+
},
|
| 474 |
+
"engines": {
|
| 475 |
+
"node": "*"
|
| 476 |
+
}
|
| 477 |
+
},
|
| 478 |
+
"node_modules/tweetnacl": {
|
| 479 |
+
"version": "0.14.5",
|
| 480 |
+
"resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz",
|
| 481 |
+
"integrity": "sha512-KXXFFdAbFXY4geFIwoyNK+f5Z1b7swfXABfL7HXCmoIWMKU3dmS26672A4EeQtDzLKy7SXmfBu51JolvEKwtGA==",
|
| 482 |
+
"license": "Unlicense"
|
| 483 |
+
},
|
| 484 |
+
"node_modules/underscore": {
|
| 485 |
+
"version": "1.7.0",
|
| 486 |
+
"resolved": "https://registry.npmjs.org/underscore/-/underscore-1.7.0.tgz",
|
| 487 |
+
"integrity": "sha512-cp0oQQyZhUM1kpJDLdGO1jPZHgS/MpzoWYfe9+CM2h/QGDZlqwT2T3YGukuBdaNJ/CAPoeyAZRRHz8JFo176vA=="
|
| 488 |
+
},
|
| 489 |
+
"node_modules/uri-js": {
|
| 490 |
+
"version": "4.4.1",
|
| 491 |
+
"resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz",
|
| 492 |
+
"integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==",
|
| 493 |
+
"license": "BSD-2-Clause",
|
| 494 |
+
"dependencies": {
|
| 495 |
+
"punycode": "^2.1.0"
|
| 496 |
+
}
|
| 497 |
+
},
|
| 498 |
+
"node_modules/uuid": {
|
| 499 |
+
"version": "3.4.0",
|
| 500 |
+
"resolved": "https://registry.npmjs.org/uuid/-/uuid-3.4.0.tgz",
|
| 501 |
+
"integrity": "sha512-HjSDRw6gZE5JMggctHBcjVak08+KEVhSIiDzFnT9S9aegmp85S/bReBVTb4QTFaRNptJ9kuYaNhnbNEOkbKb/A==",
|
| 502 |
+
"deprecated": "Please upgrade to version 7 or higher. Older versions may use Math.random() in certain circumstances, which is known to be problematic. See https://v8.dev/blog/math-random for details.",
|
| 503 |
+
"license": "MIT",
|
| 504 |
+
"bin": {
|
| 505 |
+
"uuid": "bin/uuid"
|
| 506 |
+
}
|
| 507 |
+
},
|
| 508 |
+
"node_modules/verror": {
|
| 509 |
+
"version": "1.10.0",
|
| 510 |
+
"resolved": "https://registry.npmjs.org/verror/-/verror-1.10.0.tgz",
|
| 511 |
+
"integrity": "sha512-ZZKSmDAEFOijERBLkmYfJ+vmk3w+7hOLYDNkRCuRuMJGEmqYNCNLyBBFwWKVMhfwaEF3WOd0Zlw86U/WC/+nYw==",
|
| 512 |
+
"engines": [
|
| 513 |
+
"node >=0.6.0"
|
| 514 |
+
],
|
| 515 |
+
"license": "MIT",
|
| 516 |
+
"dependencies": {
|
| 517 |
+
"assert-plus": "^1.0.0",
|
| 518 |
+
"core-util-is": "1.0.2",
|
| 519 |
+
"extsprintf": "^1.2.0"
|
| 520 |
+
}
|
| 521 |
+
}
|
| 522 |
+
}
|
| 523 |
+
}
|
package.json
ADDED
@@ -0,0 +1,6 @@
{
  "dependencies": {
    "dotenv": "^17.2.1",
    "neo4j": "^2.0.0-RC2"
  }
}
pyproject.toml
ADDED
@@ -0,0 +1,28 @@
[project]
name = "allycat-1"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "chainlit>=2.2.1",
    "docling>=2.41.0",
    "flask>=3.1.1",
    "humanfriendly>=10.0",
    "litellm>=1.74.3",
    "llama-index>=0.12.48",
    "llama-index-embeddings-huggingface>=0.5.5",
    "llama-index-llms-litellm>=0.5.1",
    "llama-index-vector-stores-milvus>=0.8.5",
    "milvus-lite>=2.5.1",
    "mimetypes-magic>=0.4.30",
    "nest-asyncio>=1.6.0",
    "pandas>=2.3.1",
    "pymilvus>=2.5.12",
    "tqdm>=4.67.1",
]

[dependency-groups]
dev = [
    "ipykernel>=6.29.5",
]
query_utils.py
ADDED
@@ -0,0 +1,12 @@
def tweak_query(query: str, model: str) -> str:
    """
    For qwen3 models, turn off thinking by appending the '/no_think' directive.
    """
    # Qwen3 models treat '/no_think' as an instruction to skip the
    # "thinking" (chain-of-thought) phase before answering.
    if 'qwen3' in model:
        # Append the directive only if the query doesn't already carry it
        if '/no_think' not in query:
            query += '\n/no_think'
    return query
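A quick usage sketch for tweak_query (the model strings below are hypothetical examples; any model name containing 'qwen3' triggers the rewrite):

    from query_utils import tweak_query

    # Qwen3 model: the '/no_think' directive is appended once
    q = tweak_query("What services does the site offer?", model="ollama/qwen3:8b")
    assert q.endswith("/no_think")

    # Non-qwen3 model: the query passes through unchanged
    q = tweak_query("What services does the site offer?", model="gpt-4o-mini")
    assert "/no_think" not in q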
requirements-build.txt
ADDED
@@ -0,0 +1,35 @@
# ============================================
# BUILD/PIPELINE REQUIREMENTS - Full Pipeline
# ============================================
# These packages are ONLY needed for running the pipeline:
# - Crawling websites (1_crawl_site.py)
# - Processing files (2_process_files.py)
# - Graph extraction (2b_process_graph_phase*.py)
# - Saving to databases (3*.py)
#
# After the pipeline completes, these can be REMOVED to save ~300-500 MB RAM

# ============================================
# Document Processing - REMOVE AFTER PIPELINE
# ============================================
docling              # PDF/HTML to markdown conversion (~100 MB)
html2text            # HTML processing (~10 MB)

# ============================================
# Graph Community Detection - REMOVE AFTER PIPELINE
# ============================================
igraph               # Graph analysis library (~50 MB)
leidenalg            # Leiden algorithm for communities (~30 MB)
graspologic          # Graph statistics (~40 MB)

# ============================================
# Development Tools - REMOVE AFTER PIPELINE
# ============================================
milvus-lite==2.4.11  # Local Milvus server (not needed if using cloud Zilliz) (~100 MB)
tqdm                 # Progress bars (nice to have, but not essential)
ipykernel            # Jupyter support (only for development)
fastmcp              # MCP support (only for development)

# ============================================
# Total Savings if Removed: ~350-500 MB
# ============================================
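As a rough illustration of how that post-pipeline removal could be automated, here is a minimal Python sketch (an assumption, not the project's actual cleanup script: it assumes requirements-build.txt sits in the working directory and that each dependency line is a bare name or name==version, optionally followed by an inline comment):

    # Sketch: uninstall the build-only packages listed above once the
    # pipeline has finished, freeing their RAM/disk footprint.
    import subprocess
    import sys

    def cleanup_pipeline_deps(req_file: str = "requirements-build.txt") -> None:
        pkgs = []
        with open(req_file) as f:
            for line in f:
                line = line.split("#", 1)[0].strip()  # drop comments/whitespace
                if not line or line.startswith("-"):  # skip blanks and pip options
                    continue
                pkgs.append(line.split("==", 1)[0])   # keep the bare package name
        if pkgs:
            subprocess.run(
                [sys.executable, "-m", "pip", "uninstall", "-y", *pkgs],
                check=False,  # tolerate packages that were never installed
            )

    if __name__ == "__main__":
        cleanup_pipeline_deps()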
requirements-docker-cloud.txt
ADDED
@@ -0,0 +1,49 @@
# Core document processing (lightweight)
docling
html2text

# Asyncio support
nest_asyncio

# PyTorch CPU-only (much smaller than GPU)
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu

# JSON parsing
orjson>=3.8.0
json-repair>=0.7.0

# Vector DB - Cloud Zilliz support
pymilvus==2.5.5

# LLM Integration
litellm

# LlamaIndex
llama-index
llama-index-embeddings-huggingface
llama-index-llms-litellm
llama-index-vector-stores-milvus==0.5.0

# Graph Database
neo4j
networkx
python-louvain
igraph
leidenalg
graspologic

# Graph LLM APIs
google-generativeai
openai>=1.0.0
fastmcp

# Web Framework
flask==2.3.3
chainlit

# Utilities
python-dotenv
humanfriendly
pandas
tqdm
requirements-docker.txt
ADDED
@@ -0,0 +1,51 @@
# Core document processing
docling
html2text

# Asyncio support
nest_asyncio

# PyTorch CPU-only (much smaller than GPU)
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu

# JSON parsing
orjson>=3.8.0
json-repair>=0.7.0

# Vector DB - Supports both cloud (Zilliz) and local (Milvus Lite)
pymilvus==2.5.5
milvus-lite==2.4.11

# LLM Integration
litellm

# LlamaIndex
llama-index
llama-index-embeddings-huggingface
llama-index-llms-litellm
llama-index-llms-ollama
llama-index-vector-stores-milvus==0.5.0

# Graph Database
neo4j
networkx
python-louvain
igraph
leidenalg
graspologic

# Graph LLM APIs
google-generativeai
openai>=1.0.0
fastmcp

# Web Frameworks
flask==2.3.3
chainlit

# Utilities
python-dotenv
humanfriendly
pandas
tqdm
requirements-runtime.txt
ADDED
@@ -0,0 +1,59 @@
# ============================================
# RUNTIME REQUIREMENTS - Flask GraphRAG App
# ============================================
# These are the MINIMAL packages needed to run the Flask GraphRAG app
# after the pipeline has been completed.
# Use this for production deployments to save ~500 MB RAM

# ============================================
# Core Runtime - DO NOT REMOVE
# ============================================

# Asyncio support
nest_asyncio

# Advanced JSON parsing (for LLM responses)
orjson>=3.8.0
json-repair>=0.7.0

# Vector Database (client-only, lightweight)
pymilvus==2.5.5

# LLM Integration
litellm

# LlamaIndex Core (for querying)
llama-index
llama-index-embeddings-huggingface
llama-index-llms-litellm
llama-index-llms-ollama
llama-index-vector-stores-milvus==0.5.0

# Graph Database (client-only)
neo4j

# Graph Analysis (for community queries)
networkx

# LLM APIs (for query synthesis)
google-generativeai
openai>=1.0.0

# Web Framework
flask==2.3.3
chainlit  # Chat UI (only if using chainlit apps)

# Utilities
python-dotenv
humanfriendly
pandas

# ============================================
# Embedding Model - LARGE (500+ MB)
# ============================================
# PyTorch CPU (required for sentence-transformers/embeddings)
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu

# Note: For cloud embeddings in the future, remove torch + sentence-transformers
# to save ~500 MB
requirements.txt
ADDED
@@ -0,0 +1,51 @@
# Core document processing
docling
html2text

# Asyncio support
nest_asyncio

# PyTorch CPU-only (much smaller than GPU)
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu

# Advanced JSON parsing
orjson>=3.8.0
json-repair>=0.7.0

# Vector Database
pymilvus==2.5.5
milvus-lite==2.4.11

# LLM Integration
litellm

# LlamaIndex
llama-index
llama-index-embeddings-huggingface
llama-index-llms-litellm
llama-index-llms-ollama
llama-index-vector-stores-milvus==0.5.0

# Graph Database
neo4j
networkx
python-louvain
igraph
leidenalg
graspologic

# Graph LLM APIs
google-generativeai
openai>=1.0.0
fastmcp

# Web Frameworks
flask==2.3.3
chainlit

# Utilities
python-dotenv
humanfriendly
pandas
tqdm
ipykernel
uv.lock
ADDED
The diff for this file is too large to render.