niloydebbarma committed on
Commit a7d2416 · verified · 1 Parent(s): 34ad93f

Upload 50 files

.env.cloud.sample ADDED
@@ -0,0 +1,122 @@
1
+ # ============================================
2
+ # AllyCAT GraphRAG - Cloud Configuration
3
+ # ============================================
4
+ # This configuration uses cloud services for all components
5
+ # Recommended for production and free-tier deployments
6
+ # Docker image size: ~800 MB
7
+
8
+ # ============================================
9
+ # Pipeline Automation (Docker Only)
10
+ # ============================================
11
+ # Set to 'true' to automatically run the complete pipeline on container startup
12
+ # This will: crawl → process → save to vector DB → process graph → save to graph DB
13
+ # Recommended for cloud deployments (Heroku, AWS, Google Cloud Run)
14
+ AUTO_RUN_PIPELINE=true
15
+
16
+ # Website to crawl (required if AUTO_RUN_PIPELINE=true)
17
+ WEBSITE_URL=https://your-website.com
18
+
19
+ # Memory Optimization: Remove pipeline dependencies after completion
20
+ # Saves ~350-500 MB RAM - Highly recommended for 1GB containers
21
+ # Enables deployment on cheaper plans: DigitalOcean $12/mo (1GB) vs $25/mo (2GB)
22
+ CLEANUP_PIPELINE_DEPS=true
23
+
24
+ # ============================================
25
+ # LLM Configuration - Cloud Mode
26
+ # ============================================
27
+ LLM_RUN_ENV=cloud
28
+ # Choose your preferred cloud LLM (via LiteLLM)
29
+ LLM_MODEL=cerebras/llama3.1-8b
30
+ # Alternative models:
31
+ # LLM_MODEL=gemini/gemini-1.5-flash
32
+
33
+ # ============================================
34
+ # LLM API Keys (Set at least one)
35
+ # ============================================
36
+ # Cerebras (Fast, free tier available)
37
+ CEREBRAS_API_KEY=your_cerebras_api_key_here
38
+
39
+ # Google Gemini (Good for graph extraction)
40
+ GEMINI_API_KEY=your_gemini_api_key_here
41
+
42
+ # Nebius (Alternative provider)
43
+ NEBIUS_API_KEY=your_nebius_api_key_here
44
+
45
+ # ============================================
46
+ # Vector Database - Zilliz Cloud
47
+ # ============================================
48
+ VECTOR_DB_TYPE=cloud_zilliz
49
+ ZILLIZ_CLUSTER_ENDPOINT=https://your-cluster.zilliz.cloud
50
+ ZILLIZ_TOKEN=your_zilliz_token_here
51
+
52
+ # ============================================
53
+ # Graph Database - Neo4j Aura Cloud
54
+ # ============================================
55
+ NEO4J_URI=neo4j+s://your-instance.databases.neo4j.io
56
+ NEO4J_USERNAME=neo4j
57
+ NEO4J_PASSWORD=your_neo4j_password_here
58
+ NEO4J_DATABASE=neo4j
59
+
60
+ # ============================================
61
+ # Application Settings
62
+ # ============================================
63
+ # Choose app type: flask_graph, chainlit_graph, flask
64
+ APP_TYPE=flask_graph
65
+
66
+ # ============================================
67
+ # Port Configuration
68
+ # ============================================
69
+ # Flask Applications
70
+ FLASK_VECTOR_PORT=8081 # app_flask.py (vector-only RAG)
71
+ FLASK_GRAPH_PORT=8080 # app_flask_graph.py (GraphRAG - default)
72
+
73
+ # Chainlit Applications
74
+ CHAINLIT_VECTOR_PORT=8082 # app_chainlit.py
75
+ CHAINLIT_GRAPH_PORT=8083 # app_chainlit_graph.py
76
+
77
+ # Docker & External Services
78
+ DOCKER_PORT=8080 # External Docker exposed port (host side)
79
+ DOCKER_APP_PORT=8080 # Internal container port (container side, set to match your APP_TYPE)
80
+ OLLAMA_PORT=11434 # Ollama server port (not used in cloud mode)
81
+
82
+ # Workspace directory
83
+ # For native execution: use relative path 'workspace'
84
+ # For Docker: use absolute path '/allycat/workspace'
85
+ WORKSPACE_DIR=/allycat/workspace
86
+
87
+ # ============================================
88
+ # Website Crawling Configuration
89
+ # ============================================
90
+ WEBSITE_URL=https://example.com
91
+ CRAWL_MAX_DOWNLOADS=100
92
+ CRAWL_MAX_DEPTH=3
93
+ WAITTIME_BETWEEN_REQUESTS=0.1
94
+
95
+ # ============================================
96
+ # Embedding Model Configuration
97
+ # ============================================
98
+ EMBEDDING_MODEL=ibm-granite/granite-embedding-30m-english
99
+ EMBEDDING_LENGTH=384
100
+ HF_ENDPOINT=https://hf-mirror.com
101
+
102
+ # ============================================
103
+ # Chunking Configuration
104
+ # ============================================
105
+ CHUNK_SIZE=512
106
+ CHUNK_OVERLAP=20
107
+
108
+ # ============================================
109
+ # Graph Extraction Configuration
110
+ # ============================================
111
+ GRAPH_MIN_ENTITIES=5
112
+ GRAPH_MAX_ENTITIES=15
113
+ GRAPH_MIN_RELATIONSHIPS=3
114
+ GRAPH_MAX_RELATIONSHIPS=8
115
+ GRAPH_MIN_CONFIDENCE=0.8
116
+ GRAPH_MAX_CONTENT_CHARS=12000
117
+ GRAPH_SENTENCE_BOUNDARY_RATIO=0.7
118
+
119
+ # ============================================
120
+ # UI Settings
121
+ # ============================================
122
+ UI_STARTER_PROMPTS=What is this website? | What are upcoming events? | Who are the partners?
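A minimal sketch of how these settings could be read from Python once the sample is copied to .env. The python-dotenv call and the fallback defaults here are assumptions for illustration; the repository's own my_config.py (not shown in this hunk) may load and name them differently.

import os
from dotenv import load_dotenv  # assumes python-dotenv is installed

load_dotenv(".env")  # copy .env.cloud.sample to .env and fill in the keys first

auto_run = os.getenv("AUTO_RUN_PIPELINE", "false").lower() == "true"
llm_model = os.getenv("LLM_MODEL", "cerebras/llama3.1-8b")
chunk_size = int(os.getenv("CHUNK_SIZE", "512"))
chunk_overlap = int(os.getenv("CHUNK_OVERLAP", "20"))
min_confidence = float(os.getenv("GRAPH_MIN_CONFIDENCE", "0.8"))

print(auto_run, llm_model, chunk_size, chunk_overlap, min_confidence)
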
.env.hybrid.sample ADDED
@@ -0,0 +1,126 @@
1
+ # ============================================
2
+ # AllyCAT GraphRAG - Hybrid Configuration
3
+ # ============================================
4
+ # This configuration uses cloud LLMs with local vector database
5
+ # Recommended for: Privacy-focused deployments with cloud AI benefits
6
+ # Docker image size: ~1.5 GB
7
+
8
+ # ============================================
9
+ # Pipeline Automation (Docker Only)
10
+ # ============================================
11
+ # Set to 'true' to automatically run the complete pipeline on container startup
12
+ # This will: crawl → process → save to vector DB → process graph → save to graph DB
13
+ AUTO_RUN_PIPELINE=false
14
+
15
+ # Website to crawl (required if AUTO_RUN_PIPELINE=true)
16
+ WEBSITE_URL=https://your-website.com
17
+
18
+ # Memory Optimization: Remove pipeline dependencies after completion
19
+ # Saves ~350-500 MB RAM - Useful for hybrid deployments on budget VPS
20
+ CLEANUP_PIPELINE_DEPS=false
21
+
22
+ # ============================================
23
+ # LLM Configuration - Cloud Mode
24
+ # ============================================
25
+ LLM_RUN_ENV=cloud
26
+ # Choose your preferred cloud LLM (via LiteLLM)
27
+ LLM_MODEL=cerebras/llama3.1-8b
28
+ # Alternative models:
29
+ # LLM_MODEL=gemini/gemini-1.5-flash
30
+
31
+ # ============================================
32
+ # LLM API Keys (Set at least one)
33
+ # ============================================
34
+ # Cerebras (Fast, free tier available)
35
+ CEREBRAS_API_KEY=your_cerebras_api_key_here
36
+
37
+ # Google Gemini (Good for graph extraction)
38
+ GEMINI_API_KEY=your_gemini_api_key_here
39
+
40
+ # Nebius (Alternative provider)
41
+ NEBIUS_API_KEY=your_nebius_api_key_here
42
+
43
+ # ============================================
44
+ # Vector Database - Local Milvus
45
+ # ============================================
46
+ VECTOR_DB_TYPE=local
47
+ # Local database files stored in workspace
48
+
49
+ # ============================================
50
+ # Graph Database - Neo4j (Cloud or Local)
51
+ # ============================================
52
+ # Option 1: Neo4j Aura Cloud (Recommended)
53
+ NEO4J_URI=neo4j+s://your-instance.databases.neo4j.io
54
+ NEO4J_USERNAME=neo4j
55
+ NEO4J_PASSWORD=your_neo4j_password_here
56
+ NEO4J_DATABASE=neo4j
57
+
58
+ # Option 2: Local Neo4j
59
+ # NEO4J_URI=bolt://localhost:7687
60
+ # NEO4J_USERNAME=neo4j
61
+ # NEO4J_PASSWORD=your_local_password
62
+ # NEO4J_DATABASE=neo4j
63
+
64
+ # ============================================
65
+ # Application Settings
66
+ # ============================================
67
+ # Choose app type: flask_graph, chainlit_graph, flask, chainlit
68
+ APP_TYPE=flask_graph
69
+
70
+ # ============================================
71
+ # Port Configuration
72
+ # ============================================
73
+ # Flask Applications
74
+ FLASK_VECTOR_PORT=8081 # app_flask.py (vector-only RAG)
75
+ FLASK_GRAPH_PORT=8080 # app_flask_graph.py (GraphRAG - default)
76
+
77
+ # Chainlit Applications
78
+ CHAINLIT_VECTOR_PORT=8082 # app_chainlit.py
79
+ CHAINLIT_GRAPH_PORT=8083 # app_chainlit_graph.py
80
+
81
+ # Docker & External Services
82
+ DOCKER_PORT=8080 # External Docker exposed port (host side)
83
+ DOCKER_APP_PORT=8080 # Internal container port (container side, set to match your APP_TYPE)
84
+ OLLAMA_PORT=11434 # Ollama server port (not used in hybrid mode)
85
+
86
+ # Workspace directory
87
+ # For native execution: use relative path 'workspace'
88
+ # For Docker: use absolute path '/allycat/workspace'
89
+ WORKSPACE_DIR=/allycat/workspace
90
+
91
+ # ============================================
92
+ # Website Crawling Configuration
93
+ # ============================================
94
+ WEBSITE_URL=https://example.com
95
+ CRAWL_MAX_DOWNLOADS=100
96
+ CRAWL_MAX_DEPTH=3
97
+ WAITTIME_BETWEEN_REQUESTS=0.1
98
+
99
+ # ============================================
100
+ # Embedding Model Configuration
101
+ # ============================================
102
+ EMBEDDING_MODEL=ibm-granite/granite-embedding-30m-english
103
+ EMBEDDING_LENGTH=384
104
+ HF_ENDPOINT=https://hf-mirror.com
105
+
106
+ # ============================================
107
+ # Chunking Configuration
108
+ # ============================================
109
+ CHUNK_SIZE=512
110
+ CHUNK_OVERLAP=20
111
+
112
+ # ============================================
113
+ # Graph Extraction Configuration
114
+ # ============================================
115
+ GRAPH_MIN_ENTITIES=5
116
+ GRAPH_MAX_ENTITIES=15
117
+ GRAPH_MIN_RELATIONSHIPS=3
118
+ GRAPH_MAX_RELATIONSHIPS=8
119
+ GRAPH_MIN_CONFIDENCE=0.8
120
+ GRAPH_MAX_CONTENT_CHARS=12000
121
+ GRAPH_SENTENCE_BOUNDARY_RATIO=0.7
122
+
123
+ # ============================================
124
+ # UI Settings
125
+ # ============================================
126
+ UI_STARTER_PROMPTS=What is this website? | What are upcoming events? | Who are the partners?
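The hybrid sample keeps the vector store local but still points at Neo4j (Aura or a local bolt instance). Below is a hedged connectivity check using the official neo4j Python driver; GraphDatabase.driver and verify_connectivity are standard driver calls, while the environment-variable handling is an assumption for illustration and not code from this repository.

import os
from neo4j import GraphDatabase

uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
auth = (os.getenv("NEO4J_USERNAME", "neo4j"), os.getenv("NEO4J_PASSWORD", ""))

with GraphDatabase.driver(uri, auth=auth) as driver:
    driver.verify_connectivity()  # raises on a bad URI, wrong credentials, or TLS scheme mismatch
    print("Neo4j connection OK:", uri)
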
.env.local.sample ADDED
@@ -0,0 +1,125 @@
1
+ # ============================================
2
+ # AllyCAT GraphRAG - Local Configuration
3
+ # ============================================
4
+ # This configuration uses local services for all components
5
+ # Recommended for development, testing, and offline deployments
6
+ # Docker image size: ~4+ GB (includes Ollama)
7
+
8
+ # ============================================
9
+ # Pipeline Automation (Docker Only)
10
+ # ============================================
11
+ # Set to 'true' to automatically run the complete pipeline on container startup
12
+ # This will: crawl → process → save to vector DB → process graph → save to graph DB
13
+ # For local development, typically set to 'false' to run steps manually
14
+ AUTO_RUN_PIPELINE=false
15
+
16
+ # Website to crawl (required if AUTO_RUN_PIPELINE=true)
17
+ WEBSITE_URL=https://your-website.com
18
+
19
+ # Memory Optimization: Remove pipeline dependencies after completion
20
+ # Saves ~350-500 MB RAM (less critical for local development)
21
+ # Set to true if running in resource-constrained environments
22
+ CLEANUP_PIPELINE_DEPS=false
23
+
24
+ # ============================================
25
+ # LLM Configuration - Local Ollama
26
+ # ============================================
27
+ LLM_RUN_ENV=local_ollama
28
+ LLM_MODEL=ollama/gemma3:1b
29
+ # Model to download and use
30
+ OLLAMA_MODEL=gemma3:1b
31
+ # Alternative local models:
32
+ # OLLAMA_MODEL=qwen2.5:1.5b
33
+ # OLLAMA_MODEL=llama3.2:1b
34
+
35
+ # ============================================
36
+ # Vector Database - Local Milvus
37
+ # ============================================
38
+ VECTOR_DB_TYPE=local
39
+ # Local database files stored in workspace
40
+
41
+ # ============================================
42
+ # Graph Database - Local or Cloud Neo4j
43
+ # ============================================
44
+ # Option 1: Local Neo4j (requires separate Neo4j installation)
45
+ NEO4J_URI=bolt://localhost:7687
46
+ NEO4J_USERNAME=neo4j
47
+ NEO4J_PASSWORD=your_local_password
48
+ NEO4J_DATABASE=neo4j
49
+
50
+ # Option 2: Or use Neo4j Aura Cloud even in local mode
51
+ # NEO4J_URI=neo4j+s://your-instance.databases.neo4j.io
52
+ # NEO4J_USERNAME=neo4j
53
+ # NEO4J_PASSWORD=your_neo4j_password_here
54
+ # NEO4J_DATABASE=neo4j
55
+
56
+ # ============================================
57
+ # Graph Extraction LLM (Cloud API recommended)
58
+ # ============================================
59
+ # Even in local mode, graph extraction benefits from cloud LLMs
60
+ # Set at least one for graph building:
61
+ GEMINI_API_KEY=your_gemini_api_key_here
62
+ CEREBRAS_API_KEY=your_cerebras_api_key_here
63
+
64
+ # ============================================
65
+ # Application Settings
66
+ # ============================================
67
+ APP_TYPE=flask_graph
68
+
69
+ # ============================================
70
+ # Port Configuration
71
+ # ============================================
72
+ # Flask Applications
73
+ FLASK_VECTOR_PORT=8081 # app_flask.py (vector-only RAG)
74
+ FLASK_GRAPH_PORT=8080 # app_flask_graph.py (GraphRAG - default)
75
+
76
+ # Chainlit Applications
77
+ CHAINLIT_VECTOR_PORT=8082 # app_chainlit.py
78
+ CHAINLIT_GRAPH_PORT=8083 # app_chainlit_graph.py
79
+
80
+ # Docker & External Services
81
+ DOCKER_PORT=8080 # External Docker exposed port (host side)
82
+ DOCKER_APP_PORT=8080 # Internal container port (container side, set to match your APP_TYPE)
83
+ OLLAMA_PORT=11434 # Ollama server port (for local LLM)
84
+
85
+ # Workspace directory
86
+ # For native execution: use relative path 'workspace'
87
+ # For Docker: use absolute path '/allycat/workspace'
88
+ WORKSPACE_DIR=/allycat/workspace
89
+
90
+ # ============================================
91
+ # Website Crawling Configuration
92
+ # ============================================
93
+ WEBSITE_URL=https://example.com
94
+ CRAWL_MAX_DOWNLOADS=100
95
+ CRAWL_MAX_DEPTH=3
96
+ WAITTIME_BETWEEN_REQUESTS=0.1
97
+
98
+ # ============================================
99
+ # Embedding Model Configuration
100
+ # ============================================
101
+ EMBEDDING_MODEL=ibm-granite/granite-embedding-30m-english
102
+ EMBEDDING_LENGTH=384
103
+ HF_ENDPOINT=https://hf-mirror.com
104
+
105
+ # ============================================
106
+ # Chunking Configuration
107
+ # ============================================
108
+ CHUNK_SIZE=512
109
+ CHUNK_OVERLAP=20
110
+
111
+ # ============================================
112
+ # Graph Extraction Configuration
113
+ # ============================================
114
+ GRAPH_MIN_ENTITIES=5
115
+ GRAPH_MAX_ENTITIES=15
116
+ GRAPH_MIN_RELATIONSHIPS=3
117
+ GRAPH_MAX_RELATIONSHIPS=8
118
+ GRAPH_MIN_CONFIDENCE=0.8
119
+ GRAPH_MAX_CONTENT_CHARS=12000
120
+ GRAPH_SENTENCE_BOUNDARY_RATIO=0.7
121
+
122
+ # ============================================
123
+ # UI Settings
124
+ # ============================================
125
+ UI_STARTER_PROMPTS=What is this website? | What are upcoming events? | Who are the partners?
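Local mode expects an Ollama server listening on OLLAMA_PORT with OLLAMA_MODEL already pulled. A small pre-flight check is sketched below; /api/tags is Ollama's standard model-listing endpoint, and the rest is an assumption for illustration rather than code from this repository.

import os
import requests

port = os.getenv("OLLAMA_PORT", "11434")
model = os.getenv("OLLAMA_MODEL", "gemma3:1b")

resp = requests.get(f"http://localhost:{port}/api/tags", timeout=5)
resp.raise_for_status()
pulled = [m["name"] for m in resp.json().get("models", [])]
print(f"{model} available:", any(name.startswith(model) for name in pulled))
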
.gitignore ADDED
@@ -0,0 +1,26 @@
1
+ workspace/
2
+ *.out*
3
+ .directory
4
+ venv*
5
+ .vscode
6
+ tmp
7
+
8
+ # Ignore actual .env file but allow sample files
9
+ .env
10
+ !.env.*.sample
11
+ !env.sample.txt
12
+
13
+ *.db
14
+ *.db.lock
15
+
16
+ ## profiling outputs
17
+ *.speed
18
+
19
+ __pycache__
20
+
21
+ chainlit.md
22
+
23
+ node_modules/
24
+
25
+ logs/
26
+ logs/*
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.11
1_crawl_site.py ADDED
@@ -0,0 +1,188 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import argparse
4
+ import shutil
5
+ import requests
6
+ from bs4 import BeautifulSoup
7
+ from urllib.parse import urljoin, urlparse
8
+ import time
9
+ import logging
10
+ import os
11
+ import re
12
+ import mimetypes
13
+ from my_config import MY_CONFIG
14
+
15
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class WebScraper:
20
+ def __init__(self, url, max_downloads, depth):
21
+ self.url = url
22
+ self.max_downloads = max_downloads
23
+ self.depth = depth
24
+ self.visited_urls = set()
25
+ self.downloaded_base_urls = set() # Track base URLs without fragments
26
+ self.downloaded_count = 0
27
+
28
+ def scrape_page(self, url, current_depth=0):
29
+ try:
30
+ # For downloading, we need to remove fragment since HTTP requests ignore them
31
+ parsed_url = urlparse(url)
32
+ download_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
33
+ if parsed_url.query:
34
+ download_url += f"?{parsed_url.query}"
35
+
36
+ # Check if we've already downloaded this base URL content
37
+ if download_url in self.downloaded_base_urls:
38
+ # If we have a fragment and haven't visited this exact URL, save with fragment name
39
+ if parsed_url.fragment and url not in self.visited_urls:
40
+ # No response cache is kept, so fetch the page again and save it under a fragment-specific filename
41
+ response = requests.get(download_url, timeout=10)
42
+ response.raise_for_status()
43
+
44
+ filename = self.url_to_filename(url, response)
45
+ filepath = os.path.join(MY_CONFIG.CRAWL_DIR, filename)
46
+
47
+ # Handle binary files vs text files based on mime type
48
+ mime_type = response.headers.get('Content-Type', '').lower()
49
+ is_text = mime_type.startswith('text/') or 'html' in mime_type or 'xml' in mime_type
50
+
51
+ if is_text:
52
+ with open(filepath, 'w', encoding='utf-8') as f:
53
+ f.write(response.text)
54
+ else:
55
+ with open(filepath, 'wb') as f:
56
+ f.write(response.content)
57
+
58
+ self.downloaded_count += 1
59
+ logger.info(f"Saved {filepath} with fragment ({self.downloaded_count}/{self.max_downloads})")
60
+
61
+ return [] # Don't re-parse links from same content
62
+ else:
63
+ logger.info(f"Skipping already downloaded URL: {download_url}")
64
+ return []
65
+
66
+ response = requests.get(download_url, timeout=10)
67
+ response.raise_for_status()
68
+
69
+ # Track that we've downloaded this base URL
70
+ self.downloaded_base_urls.add(download_url)
71
+
72
+ # Save file using original URL (with fragment) for unique filename
73
+ filename = self.url_to_filename(url, response)
74
+ filepath = os.path.join(MY_CONFIG.CRAWL_DIR, filename)
75
+
76
+ # Handle binary files vs text files based on mime type
77
+ mime_type = response.headers.get('Content-Type', '').lower()
78
+ is_text = mime_type.startswith('text/') or 'html' in mime_type or 'xml' in mime_type
79
+
80
+ if is_text:
81
+ with open(filepath, 'w', encoding='utf-8') as f:
82
+ f.write(response.text)
83
+ else:
84
+ with open(filepath, 'wb') as f:
85
+ f.write(response.content)
86
+
87
+ self.downloaded_count += 1
88
+ logger.info(f"Saved {filepath} ({self.downloaded_count}/{self.max_downloads})")
89
+
90
+ # Parse for links if not at max depth
91
+ links = []
92
+ if current_depth < self.depth:
93
+ soup = BeautifulSoup(response.content, 'html.parser')
94
+ base_domain = urlparse(self.url).netloc
95
+ for link in soup.find_all('a', href=True):
96
+ full_url = urljoin(url, link.get('href'))
97
+ if urlparse(full_url).netloc == base_domain:
98
+ links.append(full_url)
99
+
100
+ return links
101
+
102
+ except Exception as e:
103
+ logger.error(f"Error scraping {url}: {str(e)}")
104
+ return []
105
+
106
+ def url_to_filename(self, url, response):
107
+ # Keep domain and path, strip protocol, use __ for directory separators
108
+ parsed = urlparse(url)
109
+ domain = parsed.netloc
110
+ path = parsed.path
111
+ fragment = parsed.fragment
112
+
113
+ if not path or path == '/':
114
+ filename = f"{domain}__index"
115
+ else:
116
+ filename = f"{domain}{path.replace('/', '__')}"
117
+
118
+ # Add fragment (anchor) to filename if present
119
+ if fragment:
120
+ filename = f"{filename}__{fragment}"
121
+
122
+ filename = re.sub(r'[^\w\-_.]', '_', filename)
123
+
124
+ mime_type = response.headers.get('Content-Type')
125
+ if mime_type:
126
+ inferred_extension = mimetypes.guess_extension(mime_type.split(';')[0].strip())
127
+ else:
128
+ inferred_extension = '.html'
129
+
130
+ current_ext = os.path.splitext(filename)[1]
131
+ ext = os.path.splitext(filename)[1]
132
+ # print ('--- filename:', filename) # Debugging line
133
+ # print ('--- mimetype:', mime_type) # Debugging line
134
+ # print ('--- inferred_extension', inferred_extension) # Debugging line
135
+ # print ('--- current_ext:', current_ext) # Debugging line
136
+
137
+ # Append .html if the filename does not already end with the inferred extension
138
+ if not filename.endswith(inferred_extension):
139
+ filename = f"{filename}.html"
140
+
141
+ # print ('--- returning filename:', filename) # Debugging line
142
+ return filename
143
+
144
+
145
+ def scrape(self):
146
+ shutil.rmtree(MY_CONFIG.CRAWL_DIR, ignore_errors=True)
147
+ os.makedirs(MY_CONFIG.CRAWL_DIR, exist_ok=True)
148
+ logger.info(f"✅ Cleared crawl directory: {MY_CONFIG.CRAWL_DIR}")
149
+
150
+ logger.info(f"⚙ Starting scrape of {self.url}, max downloads: {self.max_downloads}, depth: {self.depth}")
151
+
152
+
153
+ urls_to_visit = [(self.url, 0)] # (url, depth)
154
+
155
+ while urls_to_visit and self.downloaded_count < self.max_downloads:
156
+ current_url, current_depth = urls_to_visit.pop(0)
157
+
158
+ if current_url in self.visited_urls:
159
+ continue
160
+
161
+ self.visited_urls.add(current_url)
162
+
163
+ links = self.scrape_page(current_url, current_depth)
164
+
165
+ # Add new URLs if not at max depth
166
+ if current_depth < self.depth:
167
+ for link in links:
168
+ if link not in self.visited_urls:
169
+ urls_to_visit.append((link, current_depth + 1))
170
+
171
+ time.sleep(MY_CONFIG.WAITTIME_BETWEEN_REQUESTS)
172
+
173
+
174
+ def main():
175
+ parser = argparse.ArgumentParser(description="Web scraper")
176
+ parser.add_argument("--url", type=str, default=MY_CONFIG.WEBSITE_URL, help=f"URL to scrape (default: {MY_CONFIG.WEBSITE_URL})")
177
+ parser.add_argument("--max-downloads", type=int, default=MY_CONFIG.CRAWL_MAX_DOWNLOADS, help=f"Maximum number of files to download (default: {MY_CONFIG.CRAWL_MAX_DOWNLOADS})")
178
+ parser.add_argument("--depth", type=int, default=MY_CONFIG.CRAWL_MAX_DEPTH, help=f"Maximum depth to crawl (default: {MY_CONFIG.CRAWL_MAX_DEPTH})")
179
+
180
+ args = parser.parse_args()
181
+
182
+ scraper = WebScraper(args.url, args.max_downloads, args.depth)
183
+ scraper.scrape()
184
+
185
+ logger.info(f"✅ Scraping completed. Downloaded {scraper.downloaded_count} files to '{MY_CONFIG.CRAWL_DIR}' directory.")
186
+
187
+ if __name__ == "__main__":
188
+ main()
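Besides the CLI exposed by main() (python 1_crawl_site.py --url ... --max-downloads ... --depth ...), the crawler can be driven programmatically. The sketch below uses the WebScraper class defined above; the URL and limits are placeholders, and importlib is used only because the module name starts with a digit.

from importlib import import_module

crawl = import_module("1_crawl_site")
scraper = crawl.WebScraper("https://example.com", max_downloads=20, depth=2)
scraper.scrape()
print(f"Downloaded {scraper.downloaded_count} files to {crawl.MY_CONFIG.CRAWL_DIR}")
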
2_process_files.ipynb ADDED
@@ -0,0 +1,135 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Processing HTML Files\n",
8
+ "\n",
9
+ "We will be using **docling**\n",
10
+ "\n",
11
+ "References\n",
12
+ "- [docling](https://github.com/DS4SD/docling)"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "markdown",
17
+ "metadata": {},
18
+ "source": [
19
+ "## Step-1: Data\n",
20
+ "\n",
21
+ "We will process data that is downloaded using [1_crawl_site.ipynb](1_crawl_site.ipynb).\n",
22
+ "\n",
23
+ "We have a couple of crawled HTML files in `input` directory. "
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "markdown",
28
+ "metadata": {},
29
+ "source": [
30
+ "## Step-2: Configuration"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 1,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "## All config is defined here\n",
40
+ "from my_config import MY_CONFIG"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 2,
46
+ "metadata": {},
47
+ "outputs": [
48
+ {
49
+ "name": "stdout",
50
+ "output_type": "stream",
51
+ "text": [
52
+ "✅ Cleared processed data directory : workspace/processed\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "import os, sys\n",
58
+ "import shutil\n",
59
+ "\n",
60
+ "shutil.rmtree(MY_CONFIG.PROCESSED_DATA_DIR, ignore_errors=True)\n",
61
+ "shutil.os.makedirs(MY_CONFIG.PROCESSED_DATA_DIR, exist_ok=True)\n",
62
+ "print (f\"✅ Cleared processed data directory : {MY_CONFIG.PROCESSED_DATA_DIR}\")"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "markdown",
67
+ "metadata": {},
68
+ "source": [
69
+ "## Step-3: Convet FILES --> MD\n",
70
+ "\n",
71
+ "Process HTML documents and extract the text in markdown format"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "%%time \n",
81
+ "\n",
82
+ "import os\n",
83
+ "import sys\n",
84
+ "from pathlib import Path\n",
85
+ "from docling.document_converter import DocumentConverter\n",
86
+ "\n",
87
+ "converter = DocumentConverter(format_options={\"preserve_links\": True})\n",
88
+ "\n",
89
+ "input_path = Path(MY_CONFIG.CRAWL_DIR)\n",
90
+ "input_files = list(input_path.glob('*.html')) + list(input_path.glob('*.htm')) + list(input_path.glob('*.pdf'))\n",
91
+ "print (f\"Found {len(input_files)} files to convert\")\n",
92
+ "\n",
93
+ "files_processed = 0\n",
94
+ "errors = 0\n",
95
+ "for input_file in input_files:\n",
96
+ " try:\n",
97
+ " result = converter.convert(input_file)\n",
98
+ " markdown_content = result.document.export_to_markdown()\n",
99
+ " \n",
100
+ " md_file_name = os.path.join(MY_CONFIG.PROCESSED_DATA_DIR, f\"{input_file.stem}.md\")\n",
101
+ " with open(md_file_name, \"w\", encoding=\"utf-8\") as md_file:\n",
102
+ " md_file.write(markdown_content)\n",
103
+ " \n",
104
+ " print (f\"Converted '{input_file}' --> '{md_file_name}'\")\n",
105
+ " files_processed += 1\n",
106
+ " except Exception as e:\n",
107
+ " errors += 1\n",
108
+ " print (f\"Error processing {input_file}: {e}\")\n",
109
+ "\n",
110
+ "print (f\"✅ Processed {files_processed} files. Errors: {errors}\")"
111
+ ]
112
+ }
113
+ ],
114
+ "metadata": {
115
+ "kernelspec": {
116
+ "display_name": "allycat-1",
117
+ "language": "python",
118
+ "name": "python3"
119
+ },
120
+ "language_info": {
121
+ "codemirror_mode": {
122
+ "name": "ipython",
123
+ "version": 3
124
+ },
125
+ "file_extension": ".py",
126
+ "mimetype": "text/x-python",
127
+ "name": "python",
128
+ "nbconvert_exporter": "python",
129
+ "pygments_lexer": "ipython3",
130
+ "version": "3.11.12"
131
+ }
132
+ },
133
+ "nbformat": 4,
134
+ "nbformat_minor": 2
135
+ }
2_process_files.py ADDED
@@ -0,0 +1,141 @@
1
+ import os, sys
2
+ import shutil
3
+ from pathlib import Path
4
+ from docling.document_converter import DocumentConverter
5
+ import html2text
6
+ import logging
7
+ import hashlib
8
+ from my_config import MY_CONFIG
9
+
10
+ logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
11
+ logger = logging.getLogger(__name__)
12
+ logger.setLevel(logging.INFO)
13
+
14
+ def cleanup_duplicate_markdown_files(processed_dir):
15
+ """
16
+ Remove duplicate markdown files based on content hash.
17
+ Keeps the first file encountered for each unique content.
18
+ """
19
+ processed_path = Path(processed_dir)
20
+ md_files = list(processed_path.glob('*.md'))
21
+
22
+ if not md_files:
23
+ logger.info("No markdown files found for deduplication")
24
+ return 0
25
+
26
+ content_hashes = {}
27
+ duplicates_removed = 0
28
+
29
+ for md_file in md_files:
30
+ try:
31
+ with open(md_file, 'r', encoding='utf-8') as f:
32
+ content = f.read()
33
+
34
+ content_hash = hashlib.md5(content.encode('utf-8')).hexdigest()
35
+
36
+ if content_hash in content_hashes:
37
+ os.remove(md_file)
38
+ duplicates_removed += 1
39
+ logger.info(f"Removed duplicate: {md_file} (same content as {content_hashes[content_hash]})")
40
+ else:
41
+ content_hashes[content_hash] = md_file
42
+
43
+ except Exception as e:
44
+ logger.warning(f"Error processing {md_file} for deduplication: {e}")
45
+
46
+ logger.info(f"✅ Deduplication complete. Removed {duplicates_removed} duplicate files")
47
+ return duplicates_removed
48
+ ## --- end of cleanup_duplicate_markdown_files ---
49
+
50
+ def process_files(crawl_dir, processed_dir):
51
+ """
52
+ Process all files in the crawl directory and convert them to markdown.
53
+ Uses html2text for HTML/HTM files and docling for PDFs and other documents.
54
+
55
+ Args:
56
+ crawl_dir (str): Directory containing files to process
57
+ processed_dir (str): Directory to save processed markdown files
58
+ """
59
+
60
+ input_path = Path(crawl_dir)
61
+ input_files = list(input_path.glob('*'))
62
+ logger.info (f"Found {len(input_files)} files to process in {input_path}")
63
+
64
+ shutil.rmtree(processed_dir, ignore_errors=True)
65
+ os.makedirs(processed_dir, exist_ok=True)
66
+ logger.info (f"✅ Cleared processed data directory : {processed_dir}")
67
+
68
+ # Initialize converters
69
+ docling_converter = DocumentConverter(format_options={"preserve_links": True})
70
+ html_converter = html2text.HTML2Text()
71
+ html_converter.ignore_links = False
72
+ html_converter.ignore_images = False
73
+
74
+ files_processed = 0
75
+ errors = 0
76
+ file_type_stats = {}
77
+
78
+ for input_file in input_files:
79
+ file_ext = input_file.suffix.lower()
80
+ markdown_content = None
81
+
82
+ try:
83
+ # Process HTML/HTM files with html2text
84
+ if file_ext in ['.html', '.htm']:
85
+ with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
86
+ html_content = f.read()
87
+ markdown_content = html_converter.handle(html_content)
88
+ logger.debug(f"Converted HTML '{input_file}' with html2text")
89
+
90
+ # Process TXT files directly
91
+ elif file_ext == '.txt':
92
+ with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
93
+ markdown_content = f.read()
94
+ logger.debug(f"Processed TXT '{input_file}' directly")
95
+
96
+ # Process PDF and other documents with docling
97
+ else:
98
+ result = docling_converter.convert(input_file)
99
+ markdown_content = result.document.export_to_markdown()
100
+ logger.debug(f"Converted '{input_file}' with docling")
101
+
102
+ # Save markdown file
103
+ if markdown_content:
104
+ md_file_name = os.path.join(processed_dir, f"{input_file.stem}.md")
105
+ with open(md_file_name, "w", encoding="utf-8") as md_file:
106
+ md_file.write(markdown_content)
107
+
108
+ files_processed += 1
109
+ file_type_stats[file_ext] = file_type_stats.get(file_ext, 0) + 1
110
+
111
+ except Exception as e:
112
+ errors += 1
113
+ logger.warning(f"Error processing {input_file}: {e}")
114
+
115
+ logger.info (f"✅ Processed {files_processed} files. Errors: {errors}")
116
+
117
+ # Print file type statistics in compact dictionary format
118
+ if file_type_stats:
119
+ logger.info(f"📊 File type statistics: {dict(sorted(file_type_stats.items()))}")
120
+
121
+ return files_processed, errors, file_type_stats
122
+ ## --- end of process_files ---
123
+
124
+ def main():
125
+ """
126
+ Main function to run the file processing pipeline.
127
+ """
128
+ logger.info("🚀 Starting file processing pipeline")
129
+
130
+ try:
131
+ files_processed, errors, file_type_stats = process_files(MY_CONFIG.CRAWL_DIR, MY_CONFIG.PROCESSED_DATA_DIR)
132
+ duplicates_removed = cleanup_duplicate_markdown_files(MY_CONFIG.PROCESSED_DATA_DIR)
133
+ logger.info(f"✅ Final summary: {files_processed} files processed, {errors} errors, {duplicates_removed} duplicates removed")
134
+ logger.info("✅ File processing pipeline completed successfully")
135
+ return 0
136
+ except Exception as e:
137
+ logger.error(f"❌ File processing pipeline failed: {e}")
138
+ return 1
139
+
140
+ if __name__ == "__main__":
141
+ sys.exit(main())
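The processing step can also be called directly when the crawl output lives somewhere other than the configured workspace. In this sketch the directory paths are placeholders (the actual MY_CONFIG.CRAWL_DIR value is not shown in this diff); process_files and cleanup_duplicate_markdown_files are the functions defined above.

from importlib import import_module

proc = import_module("2_process_files")
processed, errors, stats = proc.process_files("workspace/crawled", "workspace/processed")
removed = proc.cleanup_duplicate_markdown_files("workspace/processed")
print(f"{processed} converted, {errors} errors, {removed} duplicates removed, by type: {stats}")
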
2b_process_graph_phase1.py ADDED
@@ -0,0 +1,881 @@
1
+ """
2
+ GraphRAG Phase 1: LLM-based Entity and Relationship Extraction
3
+ Builds initial knowledge graph from markdown files using LLMs (Cerebras or Gemini)
4
+ """
5
+
6
+ import json
7
+ import logging
8
+ import os
9
+ import time
10
+ import uuid
11
+ from pathlib import Path
12
+ from typing import Any, Dict, List
13
+ from datetime import datetime
14
+ import orjson
15
+ from json_repair import repair_json
16
+ import google.generativeai as genai
17
+ import openai
18
+ from my_config import MY_CONFIG
19
+
20
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
21
+ logger = logging.getLogger(__name__)
22
+
23
+ class GraphBuilder:
24
+
25
+ def __init__(self, llm_provider="cerebras"):
26
+ self.llm_provider = llm_provider.lower()
27
+
28
+ # Global entity registry for deduplication across files
29
+ self.global_entity_registry = {}
30
+
31
+ # Initialize graph data structure
32
+ self.graph_data = {"nodes": [], "relationships": []}
33
+ self.processed_files = 0
34
+
35
+ # Initialize LLM API based on provider
36
+ if self.llm_provider == "cerebras":
37
+ if not MY_CONFIG.CEREBRAS_API_KEY:
38
+ raise ValueError("CEREBRAS_API_KEY environment variable not set. Get free key at: https://cloud.cerebras.ai/")
39
+
40
+ # Configure Cerebras client
41
+ self.cerebras_client = openai.OpenAI(
42
+ api_key=MY_CONFIG.CEREBRAS_API_KEY,
43
+ base_url="https://api.cerebras.ai/v1"
44
+ )
45
+ self.model_name = "llama-4-scout-17b-16e-instruct"
46
+ logger.info("🚀 Using Cerebras API")
47
+
48
+ elif self.llm_provider == "gemini":
49
+ if not MY_CONFIG.GEMINI_API_KEY:
50
+ raise ValueError("GEMINI_API_KEY environment variable not set. Get free key at: https://aistudio.google.com/")
51
+
52
+ # Configure Gemini with FREE tier
53
+ genai.configure(api_key=MY_CONFIG.GEMINI_API_KEY)
54
+ self.model_name = "gemini-1.5-flash"
55
+ self.gemini_model = genai.GenerativeModel(self.model_name)
56
+ logger.info("🆓 Using Google Gemini API,)")
57
+
58
+ else:
59
+ valid_providers = ["cerebras", "gemini"]
60
+ raise ValueError(f"Invalid provider '{llm_provider}'. Choose from: {valid_providers}")
61
+
62
+ # Configure extraction parameters
63
+ self.min_entities = int(os.getenv("GRAPH_MIN_ENTITIES", "5"))
64
+ self.max_entities = int(os.getenv("GRAPH_MAX_ENTITIES", "15"))
65
+ self.min_relationships = int(os.getenv("GRAPH_MIN_RELATIONSHIPS", "3"))
66
+ self.max_relationships = int(os.getenv("GRAPH_MAX_RELATIONSHIPS", "8"))
67
+ self.min_confidence = float(os.getenv("GRAPH_MIN_CONFIDENCE", "0.8"))
68
+ self.max_content_chars = int(os.getenv("GRAPH_MAX_CONTENT_CHARS", "12000"))
69
+ self.sentence_boundary_ratio = float(os.getenv("GRAPH_SENTENCE_BOUNDARY_RATIO", "0.7"))
70
+
71
+ logger.info(f"✅ Initialized {self.llm_provider.upper()} provider with model: {self.model_name}")
72
+ logger.info(f"Extraction config: {self.min_entities}-{self.max_entities} entities, {self.min_relationships}-{self.max_relationships} relationships, min confidence: {self.min_confidence}")
73
+ logger.info(f"Content processing: {self.max_content_chars} chars per chunk with overlap for FULL analysis")
74
+
75
+ # STEP 0: Clean Graph Data Folder
76
+ def clean_graph_folder(self, graph_dir: str = None):
77
+ if graph_dir is None:
78
+ graph_dir = "workspace/graph_data"
79
+ try:
80
+ graph_path = Path(graph_dir)
81
+ if graph_path.exists():
82
+ # Remove all files in the directory
83
+ for file_path in graph_path.glob("*"):
84
+ if file_path.is_file():
85
+ file_path.unlink()
86
+ logger.debug(f"Removed: {file_path.name}")
87
+ logger.info(f"Cleaned graph folder: {graph_dir}")
88
+ else:
89
+ # Create directory if it doesn't exist
90
+ graph_path.mkdir(parents=True, exist_ok=True)
91
+ logger.info(f"Created graph folder: {graph_dir}")
92
+ except Exception as e:
93
+ logger.warning(f"Failed to clean graph folder: {e}")
94
+
95
+ # STEP 1: Content Preprocessing and Chunking
96
+ def _preprocess_content(self, text: str, max_chars: int = None) -> str:
97
+ # Remove excessive whitespace but keep full content
98
+ text = ' '.join(text.split())
99
+ return text.strip()
100
+
101
+ def _chunk_content(self, text: str, chunk_size: int = None, overlap: int = 200) -> List[str]:
102
+ if chunk_size is None:
103
+ chunk_size = self.max_content_chars
104
+
105
+ # If content fits in one chunk, return as-is
106
+ if len(text) <= chunk_size:
107
+ return [text]
108
+
109
+ chunks = []
110
+ start = 0
111
+
112
+ while start < len(text):
113
+ # Calculate end position
114
+ end = start + chunk_size
115
+
116
+ if end >= len(text):
117
+ # Last chunk
118
+ chunks.append(text[start:])
119
+ break
120
+
121
+ # Try to find good break point (sentence boundary)
122
+ chunk_text = text[start:end]
123
+ last_period = chunk_text.rfind('.')
124
+ last_newline = chunk_text.rfind('\n')
125
+
126
+ # Use best break point
127
+ break_point = max(last_period, last_newline)
128
+ if break_point > chunk_size * 0.7: # Good break point
129
+ actual_end = start + break_point + 1
130
+ chunks.append(text[start:actual_end])
131
+ start = actual_end - overlap # Overlap for context
132
+ else:
133
+ # No good break point, use hard split
134
+ chunks.append(text[start:end])
135
+ start = end - overlap
136
+
137
+ return chunks
138
+
139
+ # STEP 2: LLM Prompt Generation
140
+ def get_entity_extraction_prompt(self) -> str:
141
+ return f"""You are a specialized knowledge graph extraction assistant. Your task is to analyze content and extract entities and relationships to build comprehensive knowledge graphs.
142
+
143
+ DYNAMIC EXTRACTION REQUIREMENTS:
144
+ - Extract {self.min_entities}-{self.max_entities} most important entities from the content
145
+ - Create {self.min_relationships}-{self.max_relationships} meaningful relationships between entities
146
+ - Confidence threshold: {self.min_confidence} (only include high-confidence extractions)
147
+ - Focus on extracting diverse entity types relevant to the content domain
148
+
149
+ CONSTITUTIONAL AI PRINCIPLES:
150
+ 1. Content-Adaptive: Determine entity types based on content analysis, not predefined categories
151
+ 2. Relationship-Rich: Focus on meaningful semantic relationships between entities
152
+ 3. Context-Aware: Consider document context and domain when extracting entities
153
+ 4. Quality-First: Prioritize extraction quality over quantity
154
+
155
+ ENTITY EXTRACTION GUIDELINES:
156
+ - Identify the most important concepts, terms, people, places, organizations, technologies, events
157
+ - Extract entities that would be valuable for knowledge graph queries
158
+ - Include both explicit entities (directly mentioned) and implicit entities (strongly implied)
159
+ - Assign appropriate types based on semantic analysis of the entity's role in the content
160
+
161
+ RELATIONSHIP EXTRACTION GUIDELINES:
162
+ - Create relationships that capture semantic meaning, not just co-occurrence
163
+ - Use descriptive relationship types that express the nature of the connection
164
+ - Include hierarchical, associative, and causal relationships where appropriate
165
+ - Ensure relationships are bidirectionally meaningful and contextually accurate
166
+
167
+ OUTPUT FORMAT (strict JSON):
168
+ {{
169
+ "entities": [
170
+ {{
171
+ "text": "Entity Name",
172
+ "type": "DynamicType",
173
+ "content": "Comprehensive description of the entity",
174
+ "confidence": 0.95
175
+ }}
176
+ ],
177
+ "relationships": [
178
+ {{
179
+ "startNode": "Entity Name 1",
180
+ "endNode": "Entity Name 2",
181
+ "type": "DESCRIPTIVE_RELATIONSHIP_TYPE",
182
+ "description": "Clear description of the relationship",
183
+ "evidence": "Direct evidence from text supporting this relationship",
184
+ "confidence": 0.90
185
+ }}
186
+ ]
187
+ }}
188
+
189
+ IMPORTANT: Respond with ONLY the JSON object. No explanations, no markdown formatting, no code blocks."""
190
+
191
+ # STEP 3: LLM Inference Methods
192
+ def _cerebras_inference(self, system_prompt: str, user_prompt: str) -> str:
193
+ try:
194
+ # Cerebras uses OpenAI-compatible chat format
195
+ response = self.cerebras_client.chat.completions.create(
196
+ model=self.model_name,
197
+ messages=[
198
+ {"role": "system", "content": system_prompt},
199
+ {"role": "user", "content": user_prompt}
200
+ ],
201
+ temperature=0.1,
202
+ max_tokens=2000
203
+ )
204
+
205
+ # Check for empty response
206
+ if not response or not response.choices or not response.choices[0].message.content:
207
+ raise ValueError("Empty response from Cerebras")
208
+
209
+ return response.choices[0].message.content.strip()
210
+
211
+ except Exception as e:
212
+ # Check for quota/rate limit exceeded errors
213
+ error_str = str(e).lower()
214
+ if "429" in str(e) and "quota" in error_str:
215
+ logger.error(f"🚫 QUOTA EXCEEDED: Cerebras API rate/quota limit reached - {e}")
216
+ raise Exception("QUOTA_EXCEEDED") from e
217
+ else:
218
+ logger.error(f"Error with Cerebras inference: {e}")
219
+ raise e
220
+
221
+ def _gemini_inference(self, system_prompt: str, user_prompt: str) -> str:
222
+ try:
223
+ combined_prompt = f"{system_prompt}\n\n{user_prompt}"
224
+ response = self.gemini_model.generate_content(combined_prompt)
225
+ if not response or not response.text:
226
+ raise ValueError("Empty response from Gemini")
227
+
228
+ return response.text.strip()
229
+
230
+ except Exception as e:
231
+ # Check for quota exceeded error
232
+ if "429" in str(e) and "quota" in str(e).lower():
233
+ logger.error(f"🚫 QUOTA EXCEEDED: Gemini API daily limit reached - {e}")
234
+ raise Exception("QUOTA_EXCEEDED") from e
235
+ else:
236
+ logger.error(f"Error with Gemini inference: {e}")
237
+ raise e
238
+
239
+ # STEP 4: JSON Parsing Pipeline
240
+ def _smart_json_parse(self, json_text: str) -> Dict[str, Any]:
241
+
242
+ cleaned_text = json_text.strip()
243
+
244
+ # Step 1: orjson
245
+ try:
246
+ result = orjson.loads(cleaned_text.encode('utf-8'))
247
+ logger.debug("✅ Step 1: orjson succeeded")
248
+ return result
249
+ except Exception as e:
250
+ logger.debug(f"❌ Step 1: orjson failed - {e}")
251
+
252
+ # Step 2: json-repair
253
+ try:
254
+ repaired = repair_json(cleaned_text)
255
+ result = orjson.loads(repaired.encode('utf-8'))
256
+ logger.debug("✅ Step 2: json-repair + orjson succeeded")
257
+ return result
258
+ except Exception as e:
259
+ logger.debug(f"❌ Step 2: json-repair failed - {e}")
260
+
261
+ # Step 3: standard json
262
+ try:
263
+ result = json.loads(cleaned_text)
264
+ logger.debug("✅ Step 3: standard json succeeded")
265
+ return result
266
+ except Exception as e:
267
+ logger.debug(f"❌ Step 3: standard json failed - {e}")
268
+
269
+ # Step 4: json-repair + standard json
270
+ try:
271
+ repaired = repair_json(cleaned_text)
272
+ result = json.loads(repaired)
273
+ logger.debug("✅ Step 4: json-repair + standard json succeeded")
274
+ return result
275
+ except Exception as e:
276
+ logger.debug(f"❌ Step 4: json-repair + standard json failed - {e}")
277
+
278
+ # Step 5: All failed - this will trigger save failed txt files
279
+ raise ValueError("All 4 JSON parsing steps failed")
280
+
281
+ # STEP 5: Response Parsing and Validation
282
+ def _parse_llm_extraction_response(self, llm_response: str, file_name: str) -> Dict[str, Any]:
283
+
284
+ # Clean up response first
285
+ cleaned_response = llm_response.strip()
286
+
287
+ # Remove markdown formatting
288
+ if "```json" in cleaned_response:
289
+ parts = cleaned_response.split("```json")
290
+ if len(parts) > 1:
291
+ json_part = parts[1].split("```")[0].strip()
292
+ cleaned_response = json_part
293
+ elif "```" in cleaned_response:
294
+ parts = cleaned_response.split("```")
295
+ if len(parts) >= 3:
296
+ cleaned_response = parts[1].strip()
297
+
298
+ # Use the 5-step JSON parsing pipeline
299
+ try:
300
+ extraction_data = self._smart_json_parse(cleaned_response)
301
+
302
+ # Validate complete format
303
+ if self._validate_complete_format(extraction_data):
304
+ return extraction_data
305
+ else:
306
+ self._save_failed_response(cleaned_response, file_name, "Format validation failed", "Missing required fields or empty values")
307
+ return None
308
+ except Exception as e:
309
+ logger.error(f"❌ All JSON parsing steps failed for file {file_name}: {str(e)}")
310
+ self._save_failed_response(cleaned_response, file_name, "All parsing steps failed", str(e))
311
+ return None
312
+
313
+ # STEP 6: Format Validation
314
+ def _validate_complete_format(self, extraction_data: Dict[str, Any]) -> bool:
315
+
316
+ if not isinstance(extraction_data, dict):
317
+ return False
318
+
319
+ if "entities" not in extraction_data or "relationships" not in extraction_data:
320
+ return False
321
+
322
+ entities = extraction_data.get("entities", [])
323
+ relationships = extraction_data.get("relationships", [])
324
+ if not isinstance(entities, list) or len(entities) == 0:
325
+ return False
326
+ for entity in entities:
327
+ if not isinstance(entity, dict):
328
+ return False
329
+
330
+ required_fields = ["text", "type", "content", "confidence"]
331
+ for field in required_fields:
332
+ if field not in entity:
333
+ return False
334
+ value = entity[field]
335
+ if value is None or value == "" or (isinstance(value, str) and not value.strip()):
336
+ return False
337
+
338
+ if not isinstance(entity["confidence"], (int, float)) or entity["confidence"] <= 0:
339
+ return False
340
+
341
+ if isinstance(relationships, list):
342
+ for rel in relationships:
343
+ if not isinstance(rel, dict):
344
+ return False
345
+
346
+ required_fields = ["startNode", "endNode", "type", "description", "evidence", "confidence"]
347
+ for field in required_fields:
348
+ if field not in rel:
349
+ return False
350
+ value = rel[field]
351
+ if value is None or value == "" or (isinstance(value, str) and not value.strip()):
352
+ return False
353
+
354
+ if not isinstance(rel["confidence"], (int, float)) or rel["confidence"] <= 0:
355
+ return False
356
+
357
+ return True
358
+
359
+ # STEP 7: Error Handling and Failed Response Logging
360
+ def _save_failed_response(self, llm_response: str, file_name: str, _json_error: str, _repair_error: str):
361
+ try:
362
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
363
+ output_dir = Path("workspace/graph_data")
364
+ output_dir.mkdir(parents=True, exist_ok=True)
365
+
366
+ with open(output_dir / "failed_responses.txt", 'a', encoding='utf-8') as f:
367
+ f.write(f"# Failed response from file: {file_name} at {timestamp}\n")
368
+ f.write(llm_response)
369
+ f.write("\n---\n")
370
+ f.flush()
371
+
372
+ except Exception as save_error:
373
+ logger.error(f"Failed to save failed response from {file_name}: {save_error}")
374
+
375
+ # STEP 8: Main Entity Extraction
376
+ def extract_entities_with_llm(self, content: str, file_name: str) -> Dict[str, Any]:
377
+ # Preprocess content
378
+ processed_content = self._preprocess_content(content)
379
+
380
+ # Split into chunks
381
+ chunks = self._chunk_content(processed_content)
382
+
383
+ logger.info(f"📄 Processing {file_name}: {len(processed_content)} chars in {len(chunks)} chunk(s)")
384
+
385
+ # Collect all entities and relationships from chunks
386
+ all_entities = []
387
+ all_relationships = []
388
+
389
+ for chunk_idx, chunk in enumerate(chunks):
390
+ logger.info(f"🔄 Processing chunk {chunk_idx + 1}/{len(chunks)} for {file_name}")
391
+
392
+ # Simple retry mechanism for empty content - just send to LLM again
393
+ max_retries = 3
394
+ for attempt in range(max_retries):
395
+ # Get the optimized prompt for entity extraction based on provider
396
+ system_prompt = self.get_entity_extraction_prompt()
397
+
398
+ # Create user prompt with chunk content
399
+ chunk_info = f" (chunk {chunk_idx + 1}/{len(chunks)})" if len(chunks) > 1 else ""
400
+ user_prompt = f"""
401
+ Analyze the following content from file "{file_name}"{chunk_info}:
402
+
403
+ ```
404
+ {chunk}
405
+ ```
406
+
407
+ Extract all relevant entities, concepts, and their relationships from this content.
408
+ """
409
+
410
+ # Call appropriate LLM API
411
+ try:
412
+ if self.llm_provider == "gemini":
413
+ llm_response = self._gemini_inference(system_prompt, user_prompt)
414
+ elif self.llm_provider == "cerebras":
415
+ llm_response = self._cerebras_inference(system_prompt, user_prompt)
416
+ else:
417
+ raise ValueError(f"Unsupported LLM provider: {self.llm_provider}")
418
+ except Exception as e:
419
+ if "QUOTA_EXCEEDED" in str(e):
420
+ logger.error(f"🚫 QUOTA EXCEEDED on file {file_name}, chunk {chunk_idx + 1} - stopping processing")
421
+ # Return partial results if we have any
422
+ return {
423
+ "entities": all_entities,
424
+ "relationships": all_relationships,
425
+ "file": file_name,
426
+ "structure": {"section": "partial_quota_exceeded"},
427
+ "chunks_processed": chunk_idx,
428
+ "total_content_length": len(processed_content),
429
+ "quota_exceeded": True
430
+ }
431
+ else:
432
+ raise e
433
+
434
+ # Parse the JSON response
435
+ result = self._parse_llm_extraction_response(llm_response, f"{file_name}_chunk_{chunk_idx}")
436
+ if result is not None or attempt == max_retries - 1:
437
+ if result is None:
438
+ logger.warning(f"❌ Chunk {chunk_idx + 1} of {file_name} failed all validation attempts, skipping")
439
+ break
440
+
441
+ # Chunk results to collections
442
+ chunk_entities = result.get("entities", [])
443
+ chunk_relationships = result.get("relationships", [])
444
+
445
+ # Add chunk identifier to entities for deduplication
446
+ for entity in chunk_entities:
447
+ entity["chunk_id"] = chunk_idx
448
+ entity["source_chunk"] = f"chunk_{chunk_idx}"
449
+
450
+ # Add chunk identifier to relationships
451
+ for rel in chunk_relationships:
452
+ rel["chunk_id"] = chunk_idx
453
+ rel["source_chunk"] = f"chunk_{chunk_idx}"
454
+
455
+ all_entities.extend(chunk_entities)
456
+ all_relationships.extend(chunk_relationships)
457
+
458
+ logger.info(f"✅ Chunk {chunk_idx + 1}: {len(chunk_entities)} entities, {len(chunk_relationships)} relationships")
459
+ break
460
+ else:
461
+ logger.info(f"Chunk {chunk_idx + 1} attempt {attempt + 1}/{max_retries}: Validation failed, retrying")
462
+
463
+ # Deduplicate entities across chunks (same entity name = same entity)
464
+ unique_entities = {}
465
+ for entity in all_entities:
466
+ entity_key = entity.get("text", "").lower().strip()
467
+ if entity_key and entity_key not in unique_entities:
468
+ unique_entities[entity_key] = entity
469
+ elif entity_key:
470
+ # Merge information from duplicate entities
471
+ existing = unique_entities[entity_key]
472
+ existing["confidence"] = max(existing.get("confidence", 0), entity.get("confidence", 0))
473
+ # Combine descriptions
474
+ existing_desc = existing.get("content", "")
475
+ new_desc = entity.get("content", "")
476
+ if new_desc and new_desc not in existing_desc:
477
+ existing["content"] = f"{existing_desc}; {new_desc}".strip("; ")
478
+
479
+ # Deduplicate relationships (same startNode+endNode+type = same relationship)
480
+ unique_relationships = {}
481
+ for rel in all_relationships:
482
+ rel_key = f"{rel.get('startNode', '').lower()}||{rel.get('endNode', '').lower()}||{rel.get('type', '').lower()}"
483
+ if rel_key and rel_key not in unique_relationships:
484
+ unique_relationships[rel_key] = rel
485
+ elif rel_key:
486
+ # Keep highest confidence relationship
487
+ existing = unique_relationships[rel_key]
488
+ if rel.get("confidence", 0) > existing.get("confidence", 0):
489
+ unique_relationships[rel_key] = rel
490
+
491
+ final_entities = list(unique_entities.values())
492
+ final_relationships = list(unique_relationships.values())
493
+
494
+ logger.info(f"Final results for {file_name}: {len(final_entities)} unique entities, {len(final_relationships)} unique relationships")
495
+
496
+ return {
497
+ "entities": final_entities,
498
+ "relationships": final_relationships,
499
+ "file": file_name,
500
+ "structure": {"section": "full_analysis"},
501
+ "chunks_processed": len(chunks),
502
+ "total_content_length": len(processed_content)
503
+ }
504
+
505
+
506
+
507
+ # STEP 9: Single File Processing
508
+ def process_md_file(self, md_file_path: str) -> Dict[str, Any]:
509
+ logger.info(f"Processing: {md_file_path}")
510
+
511
+ try:
512
+ # Read file content
513
+ with open(md_file_path, 'r', encoding='utf-8') as f:
514
+ content = f.read()
515
+
516
+ file_name = os.path.basename(md_file_path)
517
+
518
+ # Extract entities and relationships using LLM-only approach
519
+ llm_data = self.extract_entities_with_llm(content, file_name)
520
+
521
+ # Use LLM data - create nodes and relationships from validated data
522
+ entities_added = 0
523
+ relationships_added = 0
524
+
525
+ # Check if quota was exceeded during extraction
526
+ quota_exceeded = llm_data.get("quota_exceeded", False)
527
+ if quota_exceeded:
528
+ return {
529
+ "file": file_name,
530
+ "status": "quota_exceeded",
531
+ "entities_extracted": len(llm_data.get("entities", [])),
532
+ "unique_entities_added": 0,
533
+ "relationships_generated": 0,
534
+ "processed_at": datetime.now().isoformat(),
535
+ "error": "API quota exceeded during processing"
536
+ }
537
+
538
+ # Process entities from LLM
539
+ for entity in llm_data.get("entities", []):
540
+ entity_text = entity["text"]
541
+ semantic_key = entity_text.lower().strip()
542
+
543
+ # Add to global registry if new
544
+ if semantic_key not in self.global_entity_registry:
545
+ # Use LLM data directly
546
+ entity["id"] = str(uuid.uuid4())
547
+ entity["source_file"] = file_name
548
+
549
+ self.global_entity_registry[semantic_key] = entity
550
+ self.graph_data["nodes"].append(entity)
551
+ entities_added += 1
552
+
553
+ # Process relationships from LLM
554
+ for rel in llm_data.get("relationships", []):
555
+ # Apply confidence threshold filtering
556
+ rel_confidence = rel.get("confidence", 0.0)
557
+ if rel_confidence < self.min_confidence:
558
+ continue # Skip low-confidence relationships
559
+
560
+ start_text = rel["startNode"].lower().strip()
561
+ end_text = rel["endNode"].lower().strip()
562
+
563
+ # Only create if both entities exist
564
+ if start_text in self.global_entity_registry and end_text in self.global_entity_registry:
565
+ # Use original relationship type without sanitization
566
+ original_type = rel["type"]
567
+
568
+ # Create clean relationship with only Neo4j fields
569
+ clean_rel = {
570
+ "id": str(uuid.uuid4()),
571
+ "startNode": self.global_entity_registry[start_text]["id"],
572
+ "endNode": self.global_entity_registry[end_text]["id"],
573
+ "type": original_type, # Use original type preserving semantic meaning
574
+ "description": rel.get("description", ""),
575
+ "evidence": rel.get("evidence", ""),
576
+ "confidence": rel_confidence,
577
+ "chunk_id": rel.get("chunk_id", 0),
578
+ "source_chunk": rel.get("source_chunk", ""),
579
+ "source_file": file_name
580
+ }
581
+
582
+ self.graph_data["relationships"].append(clean_rel)
583
+ relationships_added += 1
584
+
585
+ result = {
586
+ "file": file_name,
587
+ "status": "success",
588
+ "entities_extracted": len(llm_data.get("entities", [])),
589
+ "unique_entities_added": entities_added,
590
+ "relationships_generated": relationships_added,
591
+ "processed_at": datetime.now().isoformat()
592
+ }
593
+
594
+ self.processed_files += 1
595
+ logger.info(f"✅ Processed {file_name}: {entities_added} new entities, {relationships_added} relationships")
596
+ return result
597
+
598
+ except Exception as e:
599
+ logger.error(f"❌ Error processing {md_file_path}: {e}")
600
+ return {
601
+ "file": os.path.basename(md_file_path),
602
+ "status": "error",
603
+ "error": str(e),
604
+ "processed_at": datetime.now().isoformat()
605
+ }
606
+
607
+ # STEP 10: Batch File Processing
608
+ def process_all_md_files(self, input_dir: str = None, output_path: str = None) -> Dict[str, Any]:
609
+ if input_dir is None:
610
+ input_dir = "workspace/processed"
611
+ if output_path is None:
612
+ output_path = os.path.join("workspace/graph_data", "graph-data-initial.json")
613
+
614
+ # Clean the graph folder before starting fresh processing
615
+ graph_dir = os.path.dirname(output_path)
616
+ self.clean_graph_folder(graph_dir)
617
+
618
+ input_path = Path(input_dir)
619
+ md_files = list(input_path.glob("**/*.md")) # Include subdirectories
620
+
621
+ # Ensure output directory exists
622
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
623
+
624
+ if not md_files:
625
+ logger.warning(f"No markdown files found in {input_dir}")
626
+ return {"status": "no_files", "message": "No markdown files found"}
627
+
628
+ logger.info(f"Found {len(md_files)} markdown files to process")
629
+
630
+ # Reset data structures for a clean batch processing
631
+ self.graph_data = {"nodes": [], "relationships": []}
632
+ self.global_entity_registry = {} # Reset global registry
633
+ self.processed_files = 0
634
+
635
+ logger.info(f"🚀 Starting document processing with Neo4j format output ({self.llm_provider.upper()})...")
636
+
637
+ # Process files with progress tracking
638
+ results = []
639
+ processed_successfully = []
640
+ failed_files = []
641
+ quota_exceeded_files = []
642
+ start_time = time.time()
643
+
644
+ for i, md_file in enumerate(md_files, 1):
645
+ file_start_time = time.time()
646
+ logger.info(f"Processing file {i}/{len(md_files)}: {md_file.name}")
647
+
648
+ # Track registry size before processing
649
+ initial_registry_size = len(self.global_entity_registry)
650
+ initial_relationship_count = len(self.graph_data["relationships"])
651
+
652
+ # Process the file
653
+ result = self.process_md_file(str(md_file))
654
+ results.append(result)
655
+
656
+ # Track file status for detailed logging
657
+ file_status = result.get("status", "unknown")
658
+ if file_status == "success":
659
+ processed_successfully.append(md_file.name)
660
+ elif file_status == "quota_exceeded":
661
+ quota_exceeded_files.append(md_file.name)
662
+ logger.warning(f"🚫 QUOTA EXCEEDED - Stopping batch processing at file {i}/{len(md_files)}")
663
+ break # Stop processing when quota exceeded
664
+ else:
665
+ failed_files.append((md_file.name, result.get("error", "Unknown error")))
666
+
667
+ # Calculate processing metrics
668
+ file_time = time.time() - file_start_time
669
+ new_entities = len(self.global_entity_registry) - initial_registry_size
670
+ new_relationships = len(self.graph_data["relationships"]) - initial_relationship_count
671
+
672
+ # Show detailed progress information
673
+ logger.info(f" File processed in {file_time:.2f}s: {new_entities} new entities, {new_relationships} relationships")
674
+
675
+ # Show batch progress at regular intervals
676
+ if i % 5 == 0 or i == len(md_files):
677
+ successful_so_far = sum(1 for r in results if r.get("status") == "success")
678
+ elapsed = time.time() - start_time
679
+ avg_time = elapsed / i
680
+ remaining = avg_time * (len(md_files) - i)
681
+
682
+ logger.info(f"Progress: {i}/{len(md_files)} files ({successful_so_far} successful)")
683
+ logger.info(f" Current stats: {len(self.global_entity_registry)} unique entities, {len(self.graph_data['relationships'])} relationships")
684
+ logger.info(f"Time elapsed: {elapsed:.1f}s (avg {avg_time:.1f}s per file, ~{remaining:.1f}s remaining)")
685
+
686
+ # Generate comprehensive summary with detailed tracking
687
+ elapsed = time.time() - start_time
688
+ successful = len(processed_successfully)
689
+ quota_exceeded = len(quota_exceeded_files)
690
+ failed = len(failed_files)
691
+ unique_entities = len(self.global_entity_registry)
692
+
693
+ # Save detailed processing lists
694
+ self._save_processing_logs(processed_successfully, quota_exceeded_files, failed_files, output_path)
695
+
696
+ # Count entity types
697
+ entity_types = {}
698
+ for entity_info in self.global_entity_registry.values():
699
+ entity_type = entity_info["type"]
700
+ entity_types[entity_type] = entity_types.get(entity_type, 0) + 1
701
+
702
+ # Count relationship types
703
+ relationship_types = {}
704
+ for rel in self.graph_data["relationships"]:
705
+ rel_type = rel["type"]
706
+ relationship_types[rel_type] = relationship_types.get(rel_type, 0) + 1
707
+
708
+ summary = {
709
+ "status": "completed",
710
+ "total_files": len(md_files),
711
+ "successful": successful,
712
+ "quota_exceeded": quota_exceeded,
713
+ "failed": failed,
714
+ "unique_entities": unique_entities,
715
+ "total_relationships": len(self.graph_data["relationships"]),
716
+ "entity_types": entity_types,
717
+ "relationship_types": relationship_types,
718
+ "processing_time_seconds": elapsed,
719
+ "average_time_per_file": elapsed / len(md_files) if md_files else 0,
720
+ "model": self.model_name,
721
+ "llm_provider": self.llm_provider,
722
+ "processed_at": datetime.now().isoformat()
723
+ }
724
+
725
+ logger.info(f"✅ Processing complete in {elapsed:.1f}s: {successful}/{len(md_files)} files successful")
726
+ if quota_exceeded > 0:
727
+ logger.warning(f"🚫 {quota_exceeded} files hit quota limit")
728
+ if failed > 0:
729
+ logger.error(f"❌ {failed} files failed with errors")
730
+ logger.info(f"Final stats: {unique_entities} unique entities, {len(self.graph_data['relationships'])} relationships")
731
+
732
+ # Log entity and relationship type breakdown
733
+ logger.info("Entity types:")
734
+ for entity_type, count in sorted(entity_types.items(), key=lambda x: x[1], reverse=True)[:10]:
735
+ logger.info(f" - {entity_type}: {count}")
736
+
737
+ logger.info("Relationship types:")
738
+ for rel_type, count in sorted(relationship_types.items(), key=lambda x: x[1], reverse=True)[:10]:
739
+ logger.info(f" - {rel_type}: {count}")
740
+
741
+ return summary
742
+
743
+ # STEP 10.5: Processing Logs Tracking
744
+ def _save_processing_logs(self, successful_files: List[str], quota_exceeded_files: List[str], failed_files: List[tuple], output_path: str):
745
+ try:
746
+ output_dir = Path(output_path).parent
747
+
748
+ # Save successfully processed files
749
+ with open(output_dir / "processed_successfully.txt", 'w', encoding='utf-8') as f:
750
+ f.write(f"# Successfully Processed Files ({len(successful_files)} total)\n")
751
+ f.write(f"# Generated: {datetime.now().isoformat()}\n\n")
752
+ for file_name in successful_files:
753
+ f.write(f"{file_name}\n")
754
+
755
+ # Save quota exceeded files
756
+ if quota_exceeded_files:
757
+ with open(output_dir / "quota_exceeded_files.txt", 'w', encoding='utf-8') as f:
758
+ f.write(f"# Files That Hit Quota Limit ({len(quota_exceeded_files)} total)\n")
759
+ f.write(f"# Generated: {datetime.now().isoformat()}\n\n")
760
+ for file_name in quota_exceeded_files:
761
+ f.write(f"{file_name}\n")
762
+
763
+ # Save failed files with errors
764
+ if failed_files:
765
+ with open(output_dir / "failed_files.txt", 'w', encoding='utf-8') as f:
766
+ f.write(f"# Files That Failed Processing ({len(failed_files)} total)\n")
767
+ f.write(f"# Generated: {datetime.now().isoformat()}\n\n")
768
+ for file_name, error in failed_files:
769
+ f.write(f"{file_name}: {error}\n")
770
+
771
+ logger.info(f"📋 Processing logs saved to {output_dir}")
772
+
773
+ except Exception as e:
774
+ logger.error(f"❌ Failed to save processing logs: {e}")
775
+
776
+ # STEP 11: Graph Data Output
777
+ def save_graph_data(self, output_path: str = None) -> bool:
778
+ if output_path is None:
779
+ output_path = os.path.join("workspace/graph_data", "graph-data-initial.json")
780
+ try:
781
+ # Ensure output directory exists
782
+ output_dir = Path(output_path).parent
783
+ output_dir.mkdir(parents=True, exist_ok=True)
784
+
785
+ # Compile final data from global entity registry
786
+ final_nodes = []
787
+
788
+ for semantic_key, entity_info in self.global_entity_registry.items():
789
+ entity_id = entity_info["id"]
790
+
791
+ # Create Neo4j node
792
+ node = {
793
+ "id": entity_id,
794
+ "elementId": entity_id,
795
+ "labels": [entity_info["type"]],
796
+ "properties": {
797
+ "name": entity_info["text"],
798
+ "content": entity_info.get("content", ""),
799
+ "source": entity_info.get("source_file", ""),
800
+ "confidence": entity_info["confidence"],
801
+ "created_date": datetime.now().strftime("%Y-%m-%d"),
802
+ "extraction_method": self.llm_provider
803
+ }
804
+ }
805
+ final_nodes.append(node)
806
+
807
+ # Use relationships
808
+ final_relationships = self.graph_data["relationships"]
809
+
810
+ # Prepare final graph data
811
+ final_graph = {
812
+ "nodes": final_nodes,
813
+ "relationships": final_relationships,
814
+ "metadata": {
815
+ "node_count": len(final_nodes),
816
+ "relationship_count": len(final_relationships),
817
+ "generated_at": datetime.now().isoformat(),
818
+ "generator": "Allycat GraphBuilder",
819
+ "llm_provider": self.llm_provider,
820
+ "model": self.model_name,
821
+ "format_version": "neo4j-2025"
822
+ }
823
+ }
824
+
825
+ # Save final graph data
826
+ with open(output_path, 'w', encoding='utf-8') as f:
827
+ json.dump(final_graph, f, indent=2, ensure_ascii=False)
828
+
829
+ # Calculate final output size
830
+ output_size = os.path.getsize(output_path)
831
+ output_size_mb = output_size / (1024 * 1024)
832
+
833
+ logger.info(f"✅ Neo4j graph data saved to {output_path} ({output_size_mb:.2f} MB)")
834
+ logger.info(f"Final stats: {len(final_nodes)} nodes, {len(final_relationships)} relationships")
835
+ return True
836
+
837
+ except Exception as e:
838
+ logger.error(f"❌ Error saving graph data: {e}")
839
+ return False
840
+
841
+ # STEP 12: Main Entry Point
842
+ def main():
843
+ """Main function to run the content analysis pipeline."""
844
+ logger.info(" Starting Content Analysis Pipeline (Cloud-based APIs)")
845
+
846
+ # Choose LLM provider from environment or default to cerebras
847
+ llm_provider = os.getenv("GRAPH_LLM_PROVIDER", "cerebras").lower()
848
+ logger.info(f" Using LLM provider: {llm_provider.upper()}")
849
+
850
+ # Validate provider choice
851
+ valid_providers = ["cerebras", "gemini"]
852
+ if llm_provider not in valid_providers:
853
+ logger.warning(f"⚠️ Invalid provider '{llm_provider}'. Using 'cerebras' (default)")
854
+ llm_provider = "cerebras"
855
+
856
+ try:
857
+ analyzer = GraphBuilder(llm_provider=llm_provider)
858
+
859
+ # Normal processing
860
+ summary = analyzer.process_all_md_files()
861
+
862
+ if summary["status"] == "no_files":
863
+ logger.warning("⚠️ No files to process")
864
+ return 1
865
+
866
+ if analyzer.save_graph_data():
867
+ logger.info("✅ Content Analysis completed successfully!")
868
+ logger.info(f" Results: {summary['successful']}/{summary['total_files']} files processed")
869
+ logger.info(f"Graph: {summary['unique_entities']} nodes, {summary['total_relationships']} relationships")
870
+ logger.info(f"Model used: {analyzer.model_name} via {llm_provider.upper()}")
871
+ return 0
872
+ else:
873
+ logger.error("❌ Failed to save graph data")
874
+ return 1
875
+
876
+ except Exception as e:
877
+ logger.error(f"❌ Pipeline failed: {e}")
878
+ return 1
879
+
880
+ if __name__ == "__main__":
881
+ exit(main())
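For reference, the file written by save_graph_data() can be sanity-checked before moving on to the graph phases; a minimal sketch, assuming Phase 1 has already produced the default output path used above:

    import json

    with open("workspace/graph_data/graph-data-initial.json", encoding="utf-8") as f:
        graph = json.load(f)

    print(graph["metadata"]["node_count"], "nodes,",
          graph["metadata"]["relationship_count"], "relationships")
    for node in graph["nodes"][:3]:
        print(node["labels"][0], "-", node["properties"]["name"])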
2b_process_graph_phase2.py ADDED
@@ -0,0 +1,427 @@
1
+ """
2
+ Phase 2: Community Detection using Leiden Algorithm
3
+ Loads graph-data-initial.json, runs community detection, saves graph-data-phase-2.json
4
+ """
5
+
6
+ import json
7
+ import logging
8
+ import os
9
+ import time
10
+ from pathlib import Path
11
+ from typing import Dict, Any
12
+ from collections import defaultdict
13
+
14
+ import networkx as nx
15
+ import igraph as ig
16
+ import leidenalg
17
+ import traceback
18
+
19
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ class GraphBuilderPhase2:
24
+ """Phase 2: Detect communities using graph algorithms (NetworkX + Leiden)"""
25
+
26
+ def __init__(self):
27
+ """Initialize Phase 2 processor"""
28
+ self.graph_data = None
29
+ self.nx_graph = None
30
+ self.community_result = None
31
+ self.community_stats = None
32
+ self.centrality_metrics = None
33
+
34
+ # Configuration from environment or defaults
35
+ self.min_community_size = int(os.getenv("GRAPH_MIN_COMMUNITY_SIZE", "5"))
36
+ self.leiden_resolution = float(os.getenv("GRAPH_LEIDEN_RESOLUTION", "1.0"))
37
+ self.leiden_iterations = int(os.getenv("GRAPH_LEIDEN_ITERATIONS", "-1")) # -1 = until convergence
38
+ self.leiden_seed = int(os.getenv("GRAPH_LEIDEN_SEED", "42"))
39
+
40
+ logger.info("✅ Phase 2 Initialized: Community Detection")
41
+ logger.info(f" - Min Community Size: {self.min_community_size}")
42
+ logger.info(f" - Leiden Resolution: {self.leiden_resolution}")
43
+
44
+ # STEP 1: Load Graph Data from Phase 1
45
+ def load_graph_data(self, input_path: str = None) -> bool:
46
+ """Load graph data from the specified JSON file."""
47
+ if input_path is None:
48
+ input_path = "workspace/graph_data/graph-data-initial.json"
49
+
50
+ logger.info(f"Loading graph data from {input_path}...")
51
+
52
+ try:
53
+ input_file = Path(input_path)
54
+ if not input_file.exists():
55
+ logger.error(f"❌ Input file not found: {input_path}")
56
+ logger.warning(" Please run Phase 1 (2b_process_graph_phase1.py) to generate the graph data.")
57
+ return False
58
+
59
+ with open(input_file, 'r', encoding='utf-8') as f:
60
+ self.graph_data = json.load(f)
61
+
62
+ node_count = len(self.graph_data.get("nodes", []))
63
+ rel_count = len(self.graph_data.get("relationships", []))
64
+
65
+ logger.info(f" - Found {node_count} nodes and {rel_count} relationships")
66
+
67
+ if node_count == 0:
68
+ logger.error("❌ Graph data is empty. Cannot proceed.")
69
+ return False
70
+
71
+ return True
72
+
73
+ except Exception as e:
74
+ logger.error(f"❌ Error loading graph data: {e}")
75
+ return False
76
+
77
+ # STEP 2: Build NetworkX Graph
78
+ def _build_networkx_graph(self) -> nx.Graph:
79
+ """Convert graph_data JSON to NetworkX graph for analysis"""
80
+ logger.info("Building NetworkX graph from JSON data...")
81
+
82
+ G = nx.Graph()
83
+
84
+ # Add nodes with attributes
85
+ for node in self.graph_data["nodes"]:
86
+ node_id = node["id"]
87
+ properties = node.get("properties", {})
88
+
89
+ G.add_node(
90
+ node_id,
91
+ name=properties.get("name", ""),
92
+ type=node.get("labels", ["Unknown"])[0],
93
+ description=properties.get("content", ""),
94
+ source=properties.get("source", ""),
95
+ confidence=properties.get("confidence", 0.0)
96
+ )
97
+
98
+ # Add edges with attributes
99
+ for rel in self.graph_data["relationships"]:
100
+ start_node = rel.get("startNode")
101
+ end_node = rel.get("endNode")
102
+
103
+ # Only add edge if both nodes exist
104
+ if start_node in G.nodes() and end_node in G.nodes():
105
+ G.add_edge(
106
+ start_node,
107
+ end_node,
108
+ type=rel.get("type", "RELATED_TO"),
109
+ evidence=rel.get("evidence", ""),
110
+ confidence=rel.get("confidence", 0.0)
111
+ )
112
+
113
+ logger.info(f"✅ Built NetworkX graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
114
+
115
+ # Log basic graph statistics
116
+ if G.number_of_nodes() > 0:
117
+ density = nx.density(G)
118
+ logger.info(f"📊 Graph density: {density:.4f}")
119
+
120
+ if G.number_of_edges() > 0:
121
+ avg_degree = sum(dict(G.degree()).values()) / G.number_of_nodes()
122
+ logger.info(f"📊 Average degree: {avg_degree:.2f}")
123
+
124
+ return G
125
+
126
+ # STEP 3: Convert to igraph for Leiden
127
+ def _convert_to_igraph(self, G: nx.Graph) -> ig.Graph:
128
+ """Convert NetworkX graph to igraph for Leiden algorithm"""
129
+ logger.info("🔄 Converting to igraph format for Leiden algorithm...")
130
+
131
+ # Create mapping from node IDs to indices
132
+ node_list = list(G.nodes())
133
+ node_to_idx = {node: idx for idx, node in enumerate(node_list)}
134
+
135
+ # Create edge list with indices
136
+ edges = [(node_to_idx[u], node_to_idx[v]) for u, v in G.edges()]
137
+
138
+ # Create igraph
139
+ ig_graph = ig.Graph(n=len(node_list), edges=edges, directed=False)
140
+
141
+ # Add node attributes
142
+ ig_graph.vs["name"] = [G.nodes[node].get("name", "") for node in node_list]
143
+ ig_graph.vs["node_id"] = node_list
144
+
145
+ logger.info(f"✅ Converted to igraph: {ig_graph.vcount()} vertices, {ig_graph.ecount()} edges")
146
+
147
+ return ig_graph
148
+
149
+ # STEP 4: Run Leiden Algorithm
150
+ def _run_leiden_algorithm(self, ig_graph: ig.Graph) -> Dict[str, Any]:
151
+ """Run Leiden algorithm for community detection"""
152
+ logger.info("🔍 Running Leiden community detection algorithm...")
153
+ logger.info(f"Parameters: resolution={self.leiden_resolution}, iterations={self.leiden_iterations}, seed={self.leiden_seed}")
154
+
155
+ start_time = time.time()
156
+
157
+ try:
158
+ # Run Leiden algorithm
159
+ partition = leidenalg.find_partition(
160
+ ig_graph,
161
+ leidenalg.ModularityVertexPartition,
162
+ n_iterations=self.leiden_iterations,
163
+ seed=self.leiden_seed
164
+ )
165
+
166
+ # Extract community assignments
167
+ community_assignments = {}
168
+ for idx, community_id in enumerate(partition.membership):
169
+ node_id = ig_graph.vs[idx]["node_id"]
170
+ community_assignments[node_id] = community_id
171
+
172
+ # Calculate statistics
173
+ num_communities = len(set(partition.membership))
174
+ modularity = partition.modularity
175
+
176
+ elapsed = time.time() - start_time
177
+
178
+ logger.info(f"✅ Leiden algorithm completed in {elapsed:.2f}s")
179
+ logger.info(f"Detected {num_communities} communities")
180
+ logger.info(f"Modularity score: {modularity:.4f}")
181
+
182
+ return {
183
+ "assignments": community_assignments,
184
+ "num_communities": num_communities,
185
+ "modularity": modularity,
186
+ "algorithm": "Leiden",
187
+ "execution_time": elapsed
188
+ }
189
+
190
+ except Exception as e:
191
+ logger.error(f"❌ Leiden algorithm failed: {e}")
192
+ raise e
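Note: self.leiden_resolution is logged above but never passed to find_partition, and ModularityVertexPartition does not accept a resolution setting in current leidenalg releases, so GRAPH_LEIDEN_RESOLUTION has no effect on this call. If tunable resolution is wanted, RBConfigurationVertexPartition is the usual alternative; a minimal sketch on a built-in demo graph (assumes the same leidenalg and python-igraph dependencies imported above):

    import igraph as ig
    import leidenalg

    g = ig.Graph.Famous("Zachary")  # small built-in test graph

    partition = leidenalg.find_partition(
        g,
        leidenalg.RBConfigurationVertexPartition,
        resolution_parameter=1.0,  # higher values tend to yield more, smaller communities
        n_iterations=-1,           # iterate until convergence, matching the setting above
        seed=42,
    )
    print(len(set(partition.membership)), "communities, modularity:", partition.modularity)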
193
+
194
+ # STEP 5: Calculate Community Statistics
195
+ def _calculate_community_stats(self, G: nx.Graph, community_assignments: Dict[str, int]) -> Dict[int, Dict]:
196
+ """Calculate statistics for each community"""
197
+ logger.info("Calculating community statistics...")
198
+
199
+ # Group nodes by community
200
+ communities = defaultdict(list)
201
+ for node_id, comm_id in community_assignments.items():
202
+ communities[comm_id].append(node_id)
203
+
204
+ # Calculate stats for each community
205
+ stats = {}
206
+ for comm_id, node_ids in communities.items():
207
+ # Skip very small communities if configured
208
+ if len(node_ids) < self.min_community_size:
209
+ logger.debug(f"Skipping small community {comm_id} with {len(node_ids)} members")
210
+ continue
211
+
212
+ subgraph = G.subgraph(node_ids)
213
+
214
+ stats[comm_id] = {
215
+ "member_count": len(node_ids),
216
+ "internal_edges": subgraph.number_of_edges(),
217
+ "density": nx.density(subgraph) if len(node_ids) > 1 else 0.0,
218
+ "avg_degree": sum(dict(subgraph.degree()).values()) / len(node_ids) if len(node_ids) > 0 else 0.0,
219
+ "member_ids": node_ids[:20] # Store top 20 for summary generation
220
+ }
221
+
222
+ logger.info(f"Calculated statistics for {len(stats)} communities (filtered by min_size={self.min_community_size})")
223
+
224
+ # Log top 5 largest communities
225
+ sorted_communities = sorted(stats.items(), key=lambda x: x[1]["member_count"], reverse=True)
226
+ logger.info("Top 5 largest communities:")
227
+ for comm_id, stat in sorted_communities[:5]:
228
+ logger.info(f" Community {comm_id}: {stat['member_count']} members, {stat['internal_edges']} edges, density={stat['density']:.3f}")
229
+
230
+ return stats
231
+
232
+ # STEP 6: Calculate Centrality Metrics
233
+ def _calculate_centrality_metrics(self, G: nx.Graph) -> Dict[str, Dict]:
234
+ """Calculate centrality metrics for all nodes"""
235
+ logger.info("Calculating node centrality metrics...")
236
+
237
+ start_time = time.time()
238
+
239
+ # Degree centrality (fast, always calculate)
240
+ degree_centrality = nx.degree_centrality(G)
241
+
242
+ # Betweenness centrality (expensive, only for smaller graphs)
243
+ if G.number_of_nodes() < 5000:
244
+ logger.info(" Calculating betweenness centrality...")
245
+ betweenness_centrality = nx.betweenness_centrality(G, k=min(100, G.number_of_nodes()))
246
+ else:
247
+ logger.info(" Skipping betweenness centrality (graph too large)")
248
+ betweenness_centrality = {node: 0.0 for node in G.nodes()}
249
+
250
+ # Closeness centrality (expensive, only for smaller graphs)
251
+ if G.number_of_nodes() < 5000:
252
+ logger.info("Calculating closeness centrality...")
253
+ closeness_centrality = nx.closeness_centrality(G)
254
+ else:
255
+ logger.info(" Skipping closeness centrality (graph too large)")
256
+ closeness_centrality = {node: 0.0 for node in G.nodes()}
257
+
258
+ # Combine metrics
259
+ centrality_metrics = {}
260
+ for node in G.nodes():
261
+ centrality_metrics[node] = {
262
+ "degree": G.degree(node),
263
+ "degree_centrality": degree_centrality.get(node, 0.0),
264
+ "betweenness_centrality": betweenness_centrality.get(node, 0.0),
265
+ "closeness_centrality": closeness_centrality.get(node, 0.0)
266
+ }
267
+
268
+ elapsed = time.time() - start_time
269
+ logger.info(f"✅ Calculated centrality for {len(centrality_metrics)} nodes in {elapsed:.2f}s")
270
+
271
+ return centrality_metrics
272
+
273
+ # STEP 7: Add Community Data to Nodes
274
+ def _add_community_data_to_nodes(self, community_assignments: Dict[str, int], centrality_metrics: Dict[str, Dict]) -> None:
275
+ """Add community_id and centrality metrics to node properties"""
276
+ logger.info("Adding community assignments and centrality to nodes...")
277
+
278
+ nodes_updated = 0
279
+
280
+ for node in self.graph_data["nodes"]:
281
+ node_id = node["id"]
282
+
283
+ # Add community_id
284
+ if node_id in community_assignments:
285
+ node["properties"]["community_id"] = f"comm-{community_assignments[node_id]}"
286
+ nodes_updated += 1
287
+
288
+ # Add centrality metrics
289
+ if node_id in centrality_metrics:
290
+ metrics = centrality_metrics[node_id]
291
+ node["properties"]["degree"] = metrics["degree"]
292
+ node["properties"]["degree_centrality"] = round(metrics["degree_centrality"], 4)
293
+ node["properties"]["betweenness_centrality"] = round(metrics["betweenness_centrality"], 4)
294
+ node["properties"]["closeness_centrality"] = round(metrics["closeness_centrality"], 4)
295
+
296
+ logger.info(f"✅ Updated {nodes_updated} nodes with community and centrality data")
297
+
298
+ # STEP 8: Main Processing Entry Point
299
+ def run_community_detection(self, input_path: str = None, output_path: str = None) -> bool:
300
+ """Main entry point for Phase 2"""
301
+ if output_path is None:
302
+ output_path = "workspace/graph_data/graph-data-phase-2.json"
303
+
304
+ logger.info("🚀 Starting Phase 2: Community Detection")
305
+ logger.info("=" * 60)
306
+
307
+ start_time = time.time()
308
+
309
+ # Step 1: Load Phase 1 output
310
+ if not self.load_graph_data(input_path):
311
+ return False
312
+
313
+ # Step 2: Build NetworkX graph
314
+ self.nx_graph = self._build_networkx_graph()
315
+
316
+ if self.nx_graph.number_of_nodes() == 0:
317
+ logger.error("❌ Cannot run community detection on empty graph")
318
+ return False
319
+
320
+ # Step 3: Convert to igraph
321
+ ig_graph = self._convert_to_igraph(self.nx_graph)
322
+
323
+ # Step 4: Run Leiden algorithm
324
+ self.community_result = self._run_leiden_algorithm(ig_graph)
325
+
326
+ # Step 5: Calculate community statistics
327
+ self.community_stats = self._calculate_community_stats(
328
+ self.nx_graph,
329
+ self.community_result["assignments"]
330
+ )
331
+
332
+ # Step 6: Calculate centrality metrics
333
+ self.centrality_metrics = self._calculate_centrality_metrics(self.nx_graph)
334
+
335
+ # Step 7: Add community data to nodes
336
+ self._add_community_data_to_nodes(
337
+ self.community_result["assignments"],
338
+ self.centrality_metrics
339
+ )
340
+
341
+ # Step 8: Update metadata
342
+ self.graph_data["metadata"]["phase"] = "community_detection"
343
+ self.graph_data["metadata"]["community_detection"] = {
344
+ "algorithm": "Leiden",
345
+ "num_communities": self.community_result["num_communities"],
346
+ "modularity_score": round(self.community_result["modularity"], 4),
347
+ "execution_time_seconds": round(self.community_result["execution_time"], 2),
348
+ "min_community_size": self.min_community_size,
349
+ "resolution": self.leiden_resolution
350
+ }
351
+
352
+ # Step 9: Add community statistics to output
353
+ self.graph_data["community_stats"] = self.community_stats
354
+
355
+ # Step 10: Save Phase 2 output
356
+ if self._save_phase2_output(output_path):
357
+ elapsed = time.time() - start_time
358
+ logger.info("=" * 60)
359
+ logger.info(f"✅ Phase 2 completed successfully in {elapsed:.2f}s")
360
+ logger.info(f"Final stats:")
361
+ logger.info(f" - Communities detected: {self.community_result['num_communities']}")
362
+ logger.info(f" - Modularity score: {self.community_result['modularity']:.4f}")
363
+ logger.info(f" - Nodes with community assignments: {len(self.community_result['assignments'])}")
364
+ logger.info(f" - Output saved to: {output_path}")
365
+ return True
366
+ else:
367
+ return False
368
+
369
+ # STEP 9: Save Phase 2 Output
370
+ def _save_phase2_output(self, output_path: str) -> bool:
371
+ """Save graph-data-phase-2.json"""
372
+ try:
373
+ # Ensure output directory exists
374
+ output_dir = Path(output_path).parent
375
+ output_dir.mkdir(parents=True, exist_ok=True)
376
+
377
+ # Save Phase 2 output
378
+ with open(output_path, 'w', encoding='utf-8') as f:
379
+ json.dump(self.graph_data, f, indent=2, ensure_ascii=False)
380
+
381
+ # Calculate file size
382
+ output_size = os.path.getsize(output_path)
383
+ output_size_mb = output_size / (1024 * 1024)
384
+
385
+ logger.info(f"Saved Phase 2 output: {output_path} ({output_size_mb:.2f} MB)")
386
+
387
+ return True
388
+
389
+ except Exception as e:
390
+ logger.error(f"❌ Error saving Phase 2 output: {e}")
391
+ return False
392
+
393
+
394
+ # STEP 10: Main Entry Point
395
+ def main():
396
+ """Main function to run Phase 2: Community Detection"""
397
+ logger.info("🚀 GraphRAG Phase 2: Community Detection")
398
+ logger.info(" Input: graph-data-initial.json (from Phase 1)")
399
+ logger.info(" Output: graph-data-phase-2.json")
400
+ logger.info("")
401
+
402
+ try:
403
+ # Initialize Phase 2 processor
404
+ processor = GraphBuilderPhase2()
405
+
406
+ # Run community detection
407
+ success = processor.run_community_detection()
408
+
409
+ if success:
410
+ logger.info("")
411
+ logger.info("✅ Phase 2 completed successfully!")
412
+ logger.info("Next step: Run Phase 3 (2b_process_graph_phase3.py) for community summarization")
413
+ return 0
414
+ else:
415
+ logger.error("")
416
+ logger.error("❌ Phase 2 failed")
417
+ logger.error(" Please check the logs above for details")
418
+ return 1
419
+
420
+ except Exception as e:
421
+ logger.error(f"❌ Phase 2 pipeline failed: {e}")
422
+ logger.error(traceback.format_exc())
423
+ return 1
424
+
425
+
426
+ if __name__ == "__main__":
427
+ exit(main())
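Before running Phase 3, the Phase 2 output can be inspected to confirm communities were detected; a minimal sketch against the default output path used above:

    import json

    with open("workspace/graph_data/graph-data-phase-2.json", encoding="utf-8") as f:
        graph = json.load(f)

    meta = graph["metadata"]["community_detection"]
    print("Communities:", meta["num_communities"], "- modularity:", meta["modularity_score"])

    # community_stats keys become strings when json.dump serializes the dict
    stats = graph.get("community_stats", {})
    top = sorted(stats.items(), key=lambda kv: kv[1]["member_count"], reverse=True)[:5]
    for comm_id, s in top:
        print(f"comm-{comm_id}: {s['member_count']} members, density={s['density']:.3f}")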
2b_process_graph_phase3.py ADDED
@@ -0,0 +1,1096 @@
1
+ """
2
+ Phase 3: Community Summarization using LLM
3
+ Loads graph-data-phase-2.json, generates summaries, saves graph-data-final.json
4
+ """
5
+
6
+ import json
7
+ import logging
8
+ import os
9
+ import time
10
+ from pathlib import Path
11
+ from typing import Dict, Any, List, Optional
12
+ from datetime import datetime
13
+ from collections import defaultdict
14
+
15
+ import networkx as nx
16
+ import openai
17
+ import google.generativeai as genai
18
+
19
+ # JSON parsing libraries (same as Phase 1)
20
+ import orjson
21
+ from json_repair import repair_json
22
+
23
+ from my_config import MY_CONFIG
24
+
25
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ class GraphBuilderPhase3:
30
+ """Phase 3: Generate community summaries using LLM"""
31
+
32
+ def __init__(self, llm_provider: str = "cerebras"):
33
+ """Initialize Phase 3 processor"""
34
+ self.llm_provider = llm_provider.lower()
35
+ self.graph_data = None
36
+ self.nx_graph = None
37
+ self.community_assignments = {}
38
+ self.community_stats = {}
39
+
40
+ # Initialize LLM API based on provider
41
+ if self.llm_provider == "cerebras":
42
+ if not MY_CONFIG.CEREBRAS_API_KEY:
43
+ raise ValueError("CEREBRAS_API_KEY not set")
44
+
45
+ self.cerebras_client = openai.OpenAI(
46
+ api_key=MY_CONFIG.CEREBRAS_API_KEY,
47
+ base_url="https://api.cerebras.ai/v1"
48
+ )
49
+ self.model_name = "llama-4-scout-17b-16e-instruct"
50
+ logger.info("🚀 Using Cerebras API")
51
+
52
+ elif self.llm_provider == "gemini":
53
+ if not MY_CONFIG.GEMINI_API_KEY:
54
+ raise ValueError("GEMINI_API_KEY not set")
55
+
56
+ genai.configure(api_key=MY_CONFIG.GEMINI_API_KEY)
57
+ self.model_name = "gemini-1.5-flash"
58
+ self.gemini_model = genai.GenerativeModel(self.model_name)
59
+ logger.info("🆓 Using Google Gemini API")
60
+
61
+ else:
62
+ raise ValueError(f"Invalid provider '{llm_provider}'. Choose: cerebras, gemini")
63
+
64
+ # Initialize embedding model for DRIFT search metadata
65
+ try:
66
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
67
+ self.embedding_model = HuggingFaceEmbedding(
68
+ model_name=MY_CONFIG.EMBEDDING_MODEL
69
+ )
70
+ logger.info(f"🔍 Initialized embedding model: {MY_CONFIG.EMBEDDING_MODEL}")
71
+ except Exception as e:
72
+ logger.warning(f"⚠️ Embedding model initialization failed: {e}")
73
+ self.embedding_model = None
74
+
75
+ logger.info("✅ Phase 3 initialized: Community Summarization")
76
+ logger.info(f"📊 LLM Provider: {self.llm_provider.upper()}, Model: {self.model_name}")
77
+
78
+ # STEP 1: Load Phase 2 Output
79
+ def load_graph_data(self, input_path: str = None) -> bool:
80
+ """Load graph-data-phase-2.json from Phase 2"""
81
+ if input_path is None:
82
+ input_path = "workspace/graph_data/graph-data-phase-2.json"
83
+
84
+ try:
85
+ input_file = Path(input_path)
86
+ if not input_file.exists():
87
+ logger.error(f"❌ Input file not found: {input_path}")
88
+ logger.error(" Please run Phase 2 (2b_process_graph_phase2.py) first")
89
+ return False
90
+
91
+ with open(input_file, 'r', encoding='utf-8') as f:
92
+ self.graph_data = json.load(f)
93
+
94
+ node_count = len(self.graph_data.get("nodes", []))
95
+ rel_count = len(self.graph_data.get("relationships", []))
96
+
97
+ # Verify Phase 2 was completed
98
+ if self.graph_data.get("metadata", {}).get("phase") != "community_detection":
99
+ logger.error("❌ Input file is not from Phase 2 (community_detection)")
100
+ return False
101
+
102
+ logger.info(f"📂 Loaded graph-data-phase-2.json: {node_count} nodes, {rel_count} relationships")
103
+
104
+ # Load community stats
105
+ self.community_stats = self.graph_data.get("community_stats", {})
106
+ num_communities = len(self.community_stats)
107
+ logger.info(f"📊 Found {num_communities} communities to summarize")
108
+
109
+ if num_communities == 0:
110
+ logger.error("❌ No communities found in Phase 2 output")
111
+ return False
112
+
113
+ return True
114
+
115
+ except Exception as e:
116
+ logger.error(f"❌ Error loading graph data: {e}")
117
+ return False
118
+
119
+ # STEP 2: Build NetworkX Graph
120
+ def _build_networkx_graph(self) -> nx.Graph:
121
+ """Rebuild NetworkX graph from JSON data"""
122
+ logger.info("🔨 Building NetworkX graph from JSON data...")
123
+
124
+ G = nx.Graph()
125
+
126
+ # Add nodes with attributes
127
+ for node in self.graph_data["nodes"]:
128
+ node_id = node["id"]
129
+ properties = node.get("properties", {})
130
+
131
+ G.add_node(
132
+ node_id,
133
+ name=properties.get("name", ""),
134
+ type=node.get("labels", ["Unknown"])[0],
135
+ description=properties.get("content", ""),
136
+ community_id=properties.get("community_id", ""),
137
+ degree_centrality=properties.get("degree_centrality", 0.0)
138
+ )
139
+
140
+ # Add edges
141
+ for rel in self.graph_data["relationships"]:
142
+ start_node = rel.get("startNode")
143
+ end_node = rel.get("endNode")
144
+
145
+ if start_node in G.nodes() and end_node in G.nodes():
146
+ G.add_edge(start_node, end_node)
147
+
148
+ logger.info(f"✅ Built NetworkX graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
149
+
150
+ return G
151
+
152
+ # STEP 3: Extract Community Assignments
153
+ def _extract_community_assignments(self) -> Dict[str, int]:
154
+ """Extract community assignments from node properties"""
155
+ logger.info("📋 Extracting community assignments from nodes...")
156
+
157
+ assignments = {}
158
+
159
+ for node in self.graph_data["nodes"]:
160
+ node_id = node["id"]
161
+ comm_id_str = node.get("properties", {}).get("community_id", "")
162
+
163
+ if comm_id_str and comm_id_str.startswith("comm-"):
164
+ try:
165
+ comm_id = int(comm_id_str.replace("comm-", ""))
166
+ assignments[node_id] = comm_id
167
+ except ValueError:
168
+ logger.warning(f"Invalid community_id format: {comm_id_str}")
169
+
170
+ logger.info(f"✅ Extracted {len(assignments)} community assignments")
171
+
172
+ return assignments
173
+
174
+ # STEP 4: LLM Inference Methods
175
+ def _cerebras_inference(self, system_prompt: str, user_prompt: str) -> str:
176
+ """Call Cerebras API for inference"""
177
+ try:
178
+ # Calculate dynamic parameters based on community size and complexity
179
+ total_nodes = self.nx_graph.number_of_nodes() if getattr(self, 'nx_graph', None) is not None else 100
180
+ complexity_factor = min(1.0, total_nodes / 1000)
181
+
182
+ # Adaptive temperature: higher for complex graphs to encourage creativity
183
+ dynamic_temperature = round(0.1 + (complexity_factor * 0.4), 2) # Range: 0.1-0.5
184
+
185
+ # Adaptive tokens: more for larger/complex summaries
186
+ dynamic_tokens = int(300 + (complexity_factor * 400)) # Range: 300-700
187
+
188
+ response = self.cerebras_client.chat.completions.create(
189
+ model=self.model_name,
190
+ messages=[
191
+ {"role": "system", "content": system_prompt},
192
+ {"role": "user", "content": user_prompt}
193
+ ],
194
+ temperature=dynamic_temperature,
195
+ max_tokens=dynamic_tokens
196
+ )
197
+
198
+ if not response or not response.choices or not response.choices[0].message.content:
199
+ raise ValueError("Empty response from Cerebras")
200
+
201
+ return response.choices[0].message.content.strip()
202
+
203
+ except Exception as e:
204
+ logger.error(f"Cerebras inference error: {e}")
205
+ raise e
206
+
207
+ def _gemini_inference(self, system_prompt: str, user_prompt: str) -> str:
208
+ """Call Gemini API for inference"""
209
+ try:
210
+ # Calculate dynamic generation config based on graph complexity
211
+ total_nodes = self.nx_graph.number_of_nodes() if getattr(self, 'nx_graph', None) is not None else 100
212
+ complexity_factor = min(1.0, total_nodes / 1000)
213
+
214
+ # Adaptive temperature and tokens for Gemini
215
+ dynamic_temperature = round(0.1 + (complexity_factor * 0.4), 2)
216
+ dynamic_tokens = int(300 + (complexity_factor * 400))
217
+
218
+ generation_config = {
219
+ "temperature": dynamic_temperature,
220
+ "max_output_tokens": dynamic_tokens,
221
+ "candidate_count": 1
222
+ }
223
+
224
+ combined_prompt = f"{system_prompt}\n\n{user_prompt}"
225
+ response = self.gemini_model.generate_content(
226
+ combined_prompt,
227
+ generation_config=generation_config
228
+ )
229
+
230
+ if not response or not response.text:
231
+ raise ValueError("Empty response from Gemini")
232
+
233
+ return response.text.strip()
234
+
235
+ except Exception as e:
236
+ logger.error(f"Gemini inference error: {e}")
237
+ raise e
238
+
239
+ # STEP 5: Generate Community Summaries
240
+ def _generate_community_summaries(self) -> Dict[int, str]:
241
+ """Generate LLM summaries for each community"""
242
+ logger.info("📝 Generating community summaries with LLM...")
243
+ logger.info(f" Total communities to summarize: {len(self.community_stats)}")
244
+
245
+ summaries = {}
246
+
247
+ # Group nodes by community
248
+ communities = defaultdict(list)
249
+ for node_id, comm_id in self.community_assignments.items():
250
+ communities[comm_id].append(node_id)
251
+
252
+ start_time = time.time()
253
+
254
+ for idx, (comm_id_str, stats) in enumerate(self.community_stats.items(), 1):
255
+ comm_id = int(comm_id_str)
256
+
257
+ logger.info(f" Processing community {idx}/{len(self.community_stats)}: comm-{comm_id} ({stats['member_count']} members)")
258
+
259
+ # Get top entities by centrality
260
+ node_ids = communities[comm_id]
261
+ subgraph = self.nx_graph.subgraph(node_ids)
262
+
263
+ # Get nodes sorted by degree centrality
264
+ centrality = nx.degree_centrality(subgraph)
265
+ top_nodes = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:15]
266
+
267
+ # Prepare entity information for LLM
268
+ entity_info = []
269
+ for node_id, _ in top_nodes:
270
+ node_data = self.nx_graph.nodes[node_id]
271
+ entity_info.append({
272
+ "name": node_data.get("name", "Unknown"),
273
+ "type": node_data.get("type", "Unknown"),
274
+ "description": node_data.get("description", "")[:150] # Limit length
275
+ })
276
+
277
+ # Create LLM prompt
278
+ # Senior-developer style system/user prompts with strict output schema
279
+ # Calculate dynamic topic count based on community size
280
+ topic_count = max(2, min(5, stats['member_count'] // 3)) # Scale with community size
281
+
282
+ system_prompt = (
283
+ "You are a specialized knowledge graph summarization assistant. Your task is to analyze community "
284
+ "structures and generate comprehensive summaries for graph-based retrieval systems.\n\n"
285
+ "CONSTITUTIONAL AI PRINCIPLES:\n"
286
+ "1. Content-Adaptive: Generate summaries based on actual community composition and statistics\n"
287
+ "2. Context-Aware: Consider entity relationships and community density in summarization\n"
288
+ "3. Quality-First: Prioritize accuracy and relevance over brevity\n"
289
+ "4. Structured Output: Ensure consistent JSON format for programmatic consumption\n\n"
290
+ "SUMMARIZATION GUIDELINES:\n"
291
+ "- Analyze entity types, relationships, and community structure\n"
292
+ "- Identify key themes and concepts that define this community\n"
293
+ "- Generate topics that capture semantic meaning, not just entity names\n"
294
+ "- Assess confidence based on data completeness and coherence\n"
295
+ "- Use neutral, factual tone suitable for technical documentation"
296
+ )
297
+
298
+ user_prompt = (
299
+ f"Analyze the following community data and generate a structured summary.\n\n"
300
+ f"COMMUNITY STATISTICS:\n"
301
+ f"- Total Members: {stats['member_count']}\n"
302
+ f"- Internal Connections: {stats['internal_edges']}\n"
303
+ f"- Community Density: {stats['density']:.3f}\n"
304
+ f"- Connectivity Strength: {'High' if stats['density'] > 0.1 else 'Medium' if stats['density'] > 0.05 else 'Low'}\n\n"
305
+ f"TOP ENTITIES (name, type, description):\n{json.dumps(entity_info, indent=2)}\n\n"
306
+ f"OUTPUT FORMAT (strict JSON):\n"
307
+ f"{{\n"
308
+ f" \"summary\": \"2-3 sentence comprehensive summary of community purpose and characteristics\",\n"
309
+ f" \"primary_topics\": [\"topic_1\", \"topic_2\", \"topic_{topic_count}\"],\n"
310
+ f" \"confidence\": 0.85\n"
311
+ f"}}\n\n"
312
+ f"VALIDATION REQUIREMENTS:\n"
313
+ f"- summary: Must be 2-3 complete sentences describing community focus and key characteristics\n"
314
+ f"- primary_topics: Array of exactly {topic_count} descriptive phrases (not just entity names)\n"
315
+ f"- confidence: Float between 0.0-1.0 based on data quality and coherence\n\n"
316
+ f"IMPORTANT: Respond with ONLY the JSON object. No markdown formatting, no explanations, no code blocks."
317
+ )
318
+
319
+ # Call LLM for summary
320
+ try:
321
+ if self.llm_provider == "gemini":
322
+ summary_response = self._gemini_inference(system_prompt, user_prompt)
323
+ else: # cerebras
324
+ summary_response = self._cerebras_inference(system_prompt, user_prompt)
325
+
326
+ # Parse JSON response
327
+ parsed_summary = self._parse_summary_response(summary_response, comm_id)
328
+ if parsed_summary:
329
+ summaries[comm_id] = parsed_summary
330
+ else:
331
+ # Fallback to raw response if parsing fails
332
+ summaries[comm_id] = summary_response.strip()
333
+
334
+ # Log progress every 10 communities
335
+ if idx % 10 == 0:
336
+ elapsed = time.time() - start_time
337
+ avg_time = elapsed / idx
338
+ remaining = avg_time * (len(self.community_stats) - idx)
339
+ logger.info(f" Progress: {idx}/{len(self.community_stats)} ({elapsed:.1f}s elapsed, ~{remaining:.1f}s remaining)")
340
+
341
+ except Exception as e:
342
+ logger.error(f"❌ Failed to generate summary for community {comm_id}: {e}")
343
+ summaries[comm_id] = f"Community with {stats['member_count']} entities focused on {entity_info[0]['type'] if entity_info else 'various'} topics."
344
+
345
+ elapsed = time.time() - start_time
346
+ logger.info(f"✅ Generated {len(summaries)} community summaries in {elapsed:.1f}s")
347
+
348
+ return summaries
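The prompt above requests a strict JSON object; an illustrative response that the parser below would accept looks like the following (values are made up):

    import json

    raw = ('{"summary": "Entities in this community describe local training programs '
           'and the organizations that host them.", '
           '"primary_topics": ["training programs", "host organizations"], '
           '"confidence": 0.8}')

    data = json.loads(raw)
    assert isinstance(data["summary"], str) and len(data["primary_topics"]) >= 2
    assert 0.0 <= data["confidence"] <= 1.0
    print(data["summary"])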
349
+
350
+ def _parse_summary_response(self, response: str, comm_id: int) -> Optional[str]:
351
+ """Parse JSON summary response with fallback to text extraction"""
352
+ try:
353
+ # Clean response
354
+ cleaned_response = response.strip()
355
+
356
+ # Remove markdown formatting
357
+ if "```json" in cleaned_response:
358
+ parts = cleaned_response.split("```json")
359
+ if len(parts) > 1:
360
+ json_part = parts[1].split("```")[0].strip()
361
+ cleaned_response = json_part
362
+ elif "```" in cleaned_response:
363
+ parts = cleaned_response.split("```")
364
+ if len(parts) >= 3:
365
+ cleaned_response = parts[1].strip()
366
+
367
+ # Try to parse JSON
368
+ try:
369
+ summary_data = self._smart_json_parse_summary(cleaned_response)
370
+ if summary_data and isinstance(summary_data, dict):
371
+ summary_text = summary_data.get('summary', '')
372
+ if summary_text and len(summary_text.strip()) > 10:
373
+ return summary_text.strip()
374
+ except ValueError as e:
375
+ logger.debug(f"Summary JSON parsing failed for comm-{comm_id}: {e}")
376
+ except Exception as e:
377
+ logger.debug(f"Summary JSON parsing unexpected error for comm-{comm_id}: {e}")
378
+
379
+ except Exception as e:
380
+ logger.debug(f"Summary JSON parsing failed for comm-{comm_id}: {e}")
381
+
382
+ # Fallback: extract first meaningful sentence
383
+ try:
384
+ lines = response.split('\n')
385
+ for line in lines:
386
+ line = line.strip()
387
+ if len(line) > 20 and '.' in line and not line.startswith('{'):
388
+ return line
389
+ except Exception:
390
+ pass
391
+
392
+ return None
393
+
394
+ def _smart_json_parse_summary(self, json_text: str) -> Dict:
395
+ """
396
+ Simple 5-step JSON parsing approach (exactly same as Phase 1)
397
+ """
398
+ cleaned_text = json_text.strip()
399
+
400
+ # Step 1: orjson
401
+ try:
402
+ result = orjson.loads(cleaned_text.encode('utf-8'))
403
+ logger.debug("✅ Step 1: orjson succeeded")
404
+ return result
405
+ except Exception as e:
406
+ logger.debug(f"❌ Step 1: orjson failed - {e}")
407
+
408
+ # Step 2: json-repair
409
+ try:
410
+ repaired = repair_json(cleaned_text)
411
+ result = orjson.loads(repaired.encode('utf-8'))
412
+ logger.debug("✅ Step 2: json-repair + orjson succeeded")
413
+ return result
414
+ except Exception as e:
415
+ logger.debug(f"❌ Step 2: json-repair failed - {e}")
416
+
417
+ # Step 3: standard json
418
+ try:
419
+ result = json.loads(cleaned_text)
420
+ logger.debug("✅ Step 3: standard json succeeded")
421
+ return result
422
+ except Exception as e:
423
+ logger.debug(f"❌ Step 3: standard json failed - {e}")
424
+
425
+ # Step 4: json-repair + standard json
426
+ try:
427
+ repaired = repair_json(cleaned_text)
428
+ result = json.loads(repaired)
429
+ logger.debug("✅ Step 4: json-repair + standard json succeeded")
430
+ return result
431
+ except Exception as e:
432
+ logger.debug(f"❌ Step 4: json-repair + standard json failed - {e}")
433
+
434
+ # Step 5: all parsers failed - raise so the caller can fall back to plain-text extraction
435
+ raise ValueError("All 4 JSON parsing steps failed")
436
+
437
+ # STEP 6: Identify Key Entities
438
+ def _identify_key_entities(self) -> Dict[int, List[str]]:
439
+ """Identify key entities in each community based on centrality"""
440
+ logger.info("🔑 Identifying key entities per community...")
441
+
442
+ key_entities = {}
443
+
444
+ # Group nodes by community
445
+ communities = defaultdict(list)
446
+ for node_id, comm_id in self.community_assignments.items():
447
+ communities[comm_id].append(node_id)
448
+
449
+ for comm_id, node_ids in communities.items():
450
+ subgraph = self.nx_graph.subgraph(node_ids)
451
+
452
+ # Calculate degree centrality
453
+ centrality = nx.degree_centrality(subgraph)
454
+
455
+ # Get top 5 entities
456
+ top_nodes = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:5]
457
+
458
+ key_entities[comm_id] = [
459
+ self.nx_graph.nodes[node_id].get("name", "Unknown")
460
+ for node_id, _ in top_nodes
461
+ ]
462
+
463
+ logger.info(f"✅ Identified key entities for {len(key_entities)} communities")
464
+
465
+ return key_entities
466
+
467
+ # STEP 7: Create Community Nodes
468
+ def _create_community_nodes(self, community_summaries: Dict[int, str], key_entities: Dict[int, List[str]]) -> List[Dict]:
469
+ """Create community nodes for the graph"""
470
+ logger.info("🏗️ Creating community nodes...")
471
+
472
+ import uuid
473
+
474
+ community_nodes = []
475
+
476
+ for comm_id_str, stats in self.community_stats.items():
477
+ comm_id = int(comm_id_str)
478
+
479
+ node = {
480
+ "id": f"community-{uuid.uuid4()}",
481
+ "elementId": f"community-{uuid.uuid4()}",
482
+ "labels": ["Community"],
483
+ "properties": {
484
+ "community_id": f"comm-{comm_id}",
485
+ "level": 1,
486
+ "member_count": stats["member_count"],
487
+ "internal_edges": stats["internal_edges"],
488
+ "density": round(stats["density"], 4),
489
+ "avg_degree": round(stats["avg_degree"], 2),
490
+ "summary": community_summaries.get(comm_id, ""),
491
+ "key_entities": key_entities.get(comm_id, []),
492
+ "created_date": datetime.now().isoformat()
493
+ }
494
+ }
495
+ community_nodes.append(node)
496
+
497
+ logger.info(f"✅ Created {len(community_nodes)} community nodes")
498
+
499
+ return community_nodes
500
+
501
+ # STEP 8: Create IN_COMMUNITY Relationships
502
+ def _create_in_community_relationships(self, community_nodes: List[Dict]) -> List[Dict]:
503
+ """Create IN_COMMUNITY relationships linking entities to communities"""
504
+ logger.info("Creating IN_COMMUNITY relationships...")
505
+
506
+ import uuid
507
+
508
+ # Create mapping from community_id to community node id
509
+ comm_id_to_node_id = {}
510
+ for node in community_nodes:
511
+ comm_id = node["properties"]["community_id"]
512
+ comm_id_to_node_id[comm_id] = node["id"]
513
+
514
+ relationships = []
515
+
516
+ for entity_id, comm_id in self.community_assignments.items():
517
+ comm_node_id = comm_id_to_node_id.get(f"comm-{comm_id}")
518
+
519
+ if comm_node_id:
520
+ # Calculate confidence based on community membership strength
521
+ entity_node = next((n for n in self.graph_data['nodes'] if n['id'] == entity_id), None)
522
+ if entity_node:
523
+ degree_centrality = entity_node.get('properties', {}).get('degree_centrality', 0.5)
524
+ # Higher centrality = higher confidence in community assignment
525
+ dynamic_confidence = round(0.6 + (degree_centrality * 0.4), 3) # Range: 0.6-1.0
526
+ else:
527
+ dynamic_confidence = 0.8 # Default for missing nodes
528
+
529
+ rel = {
530
+ "id": f"rel-{uuid.uuid4()}",
531
+ "startNode": entity_id,
532
+ "endNode": comm_node_id,
533
+ "type": "IN_COMMUNITY",
534
+ "properties": {
535
+ "confidence": dynamic_confidence,
536
+ "assigned_date": datetime.now().isoformat()
537
+ }
538
+ }
539
+ relationships.append(rel)
540
+
541
+ logger.info(f"✅ Created {len(relationships)} IN_COMMUNITY relationships")
542
+
543
+ return relationships
544
+
545
+ # STEP 9: DRIFT Search Metadata Generation
546
+ def _generate_drift_metadata(self, community_summaries: Dict[int, str], key_entities: Dict[int, List[str]]) -> Dict:
547
+ """Generate DRIFT search metadata using existing embedding infrastructure"""
548
+ logger.info("🔍 Generating DRIFT search metadata...")
549
+
550
+ if not self.embedding_model:
551
+ logger.warning("⚠️ Embedding model not available, skipping DRIFT metadata")
552
+ return {}
553
+
554
+ # Calculate dynamic values from actual graph data
555
+ total_communities = len(community_summaries)
556
+ total_nodes = self.nx_graph.number_of_nodes()
557
+ total_edges = self.nx_graph.number_of_edges()
558
+ avg_community_size = sum(self.community_stats.get(str(i), {}).get("member_count", 0)
559
+ for i in community_summaries.keys()) / total_communities if total_communities > 0 else 0
560
+ graph_density = total_edges / (total_nodes * (total_nodes - 1) / 2) if total_nodes > 1 else 0
561
+
562
+ # Calculate dynamic thresholds based on graph complexity
563
+ complexity_factor = min(1.0, (total_nodes + total_edges) / 10000) # Scale 0-1 based on graph size
564
+ base_confidence = 0.6 + (complexity_factor * 0.3) # Range: 0.6-0.9
565
+ base_response_time = 1.0 + (complexity_factor * 3.0) # Range: 1-4 seconds
566
+ base_memory = int(20 + (avg_community_size * complexity_factor * 5)) # Scale with size
567
+
568
+ # Adaptive configuration based on graph characteristics
569
+ max_communities_for_primer = min(total_communities, max(2, total_communities // 4))
570
+ lightweight_communities = max(1, max_communities_for_primer // 2)
571
+ standard_communities = max(2, int(max_communities_for_primer // 1.5))
572
+ comprehensive_communities = max_communities_for_primer
573
+
574
+ # Calculate dynamic iteration counts based on community distribution
575
+ max_iter = max(2, min(5, int(total_communities / 10) + 2))
576
+ hyde_count = max(2, min(5, int(avg_community_size / 5) + 2))
577
+
578
+ drift_metadata = {
579
+ "version": "1.0",
580
+ "generated_timestamp": datetime.now().isoformat(),
581
+ "configuration": {
582
+ "max_iterations": max_iter,
583
+ "confidence_threshold": round(base_confidence + 0.1, 2),
584
+ "top_k_communities": max_communities_for_primer,
585
+ "hyde_expansion_count": hyde_count,
586
+ "termination_criteria": "confidence_or_max_iterations"
587
+ },
588
+ "query_routing_config": {
589
+ "lightweight_drift": {
590
+ "triggers": ["single_entity", "simple_fact", "definition_query"],
591
+ "config": {
592
+ "primer_communities": int(lightweight_communities),
593
+ "follow_up_iterations": max(1, max_iter - 2),
594
+ "confidence_threshold": round(base_confidence, 2)
595
+ }
596
+ },
597
+ "standard_drift": {
598
+ "triggers": ["multi_entity", "relationship_query", "how_does"],
599
+ "config": {
600
+ "primer_communities": int(standard_communities),
601
+ "follow_up_iterations": max(1, max_iter - 1),
602
+ "confidence_threshold": round(base_confidence + 0.1, 2)
603
+ }
604
+ },
605
+ "comprehensive_drift": {
606
+ "triggers": ["analyze", "compare", "implications", "strategy"],
607
+ "config": {
608
+ "primer_communities": int(comprehensive_communities),
609
+ "follow_up_iterations": max_iter,
610
+ "confidence_threshold": round(base_confidence + 0.2, 2)
611
+ }
612
+ }
613
+ },
614
+ "performance_monitoring": {
615
+ "response_time_targets": {
616
+ "p50": round(base_response_time * 1.0, 1),
617
+ "p95": round(base_response_time * 2.5, 1),
618
+ "p99": round(base_response_time * 5.0, 1)
619
+ },
620
+ "resource_tracking": {
621
+ "memory_per_query": base_memory,
622
+ "cache_hit_rate_target": round(0.5 + (complexity_factor * 0.3), 2)
623
+ },
624
+ "bottleneck_identification": ["community_ranking", "follow_up_generation", "embedding_computation"]
625
+ },
626
+ "community_search_index": {},
627
+ "search_optimization": {
628
+ "total_communities": total_communities,
629
+ "avg_community_size": round(avg_community_size, 1),
630
+ "graph_density": round(graph_density, 6),
631
+ "total_nodes": total_nodes,
632
+ "total_edges": total_edges,
633
+ "max_primer_communities": max_communities_for_primer
634
+ }
635
+ }
636
+
637
+ # Process each community
638
+ for comm_id, summary in community_summaries.items():
639
+ comm_key = f"comm-{comm_id}"
640
+
641
+ try:
642
+ # Generate embeddings using existing HuggingFace model
643
+ summary_embedding = self.embedding_model.get_text_embedding(summary)
644
+ hyde_embeddings = self._generate_hyde_embeddings(summary)
645
+ follow_up_questions = self._generate_follow_up_questions(summary, comm_id, key_entities.get(comm_id, []))
646
+
647
+ # Add to search index
648
+ drift_metadata["community_search_index"][comm_key] = {
649
+ "summary": summary,
650
+ "key_entities": key_entities.get(comm_id, []),
651
+ "embeddings": {
652
+ "summary_embedding": summary_embedding,
653
+ "hyde_embeddings": hyde_embeddings
654
+ },
655
+ "follow_up_templates": follow_up_questions,
656
+ "statistics": self.community_stats.get(str(comm_id), {})
657
+ }
658
+
659
+ except Exception as e:
660
+ logger.warning(f"⚠️ Failed to generate metadata for {comm_key}: {e}")
661
+ continue
662
+
663
+ logger.info(f"✅ Generated DRIFT metadata for {len(drift_metadata['community_search_index'])} communities")
664
+ return drift_metadata
665
+
666
+ def _generate_hyde_embeddings(self, community_summary: str) -> List[List[float]]:
667
+ """Generate HyDE embeddings for enhanced recall"""
668
+
669
+ # Create 3 hypothetical document variations
670
+ hyde_templates = [
671
+ f"Research analysis and findings: {community_summary}",
672
+ f"Technical report and documentation: {community_summary}",
673
+ f"Business implications and strategic analysis: {community_summary}"
674
+ ]
675
+
676
+ hyde_embeddings = []
677
+ for template in hyde_templates:
678
+ try:
679
+ embedding = self.embedding_model.get_text_embedding(template)
680
+ hyde_embeddings.append(embedding)
681
+ except Exception as e:
682
+ logger.warning(f"⚠️ HyDE embedding generation failed: {e}")
683
+ continue
684
+
685
+ return hyde_embeddings
686
+
687
+ def _generate_follow_up_questions(self, community_summary: str, comm_id: int, key_entities: List[str]) -> List[Dict]:
688
+ """Generate follow-up questions using existing LLM infrastructure"""
689
+
690
+ # Professional system prompt matching Phase 1 style
691
+ system_prompt = (
692
+ "You are a specialized DRIFT search question generation assistant. Your task is to analyze community "
693
+ "summaries and generate targeted follow-up questions for iterative knowledge graph exploration.\n\n"
694
+ "CONSTITUTIONAL AI PRINCIPLES:\n"
695
+ "1. Context-Adaptive: Generate questions based on actual community content and entities\n"
696
+ "2. Search-Aware: Choose appropriate search types to guide query routing optimization\n"
697
+ "3. Relevance-First: Prioritize questions that expand understanding of community themes\n"
698
+ "4. Structured Output: Ensure consistent JSON format for programmatic consumption\n\n"
699
+ "QUESTION GENERATION GUIDELINES:\n"
700
+ "- Analyze community summary and key entities to identify knowledge gaps\n"
701
+ "- Generate questions that would reveal additional relevant information\n"
702
+ "- Use local search for entity-specific queries, relationship for connections, global for themes\n"
703
+ "- Assign relevance scores based on potential value for understanding the community\n"
704
+ "- Target entities should guide search focus and retrieval optimization"
705
+ )
706
+
707
+ user_prompt = (
708
+ f"Analyze the following community data and generate targeted follow-up questions.\n\n"
709
+ f"COMMUNITY SUMMARY:\n{community_summary}\n\n"
710
+ f"KEY ENTITIES: {', '.join(key_entities[:5]) if key_entities else 'No specific entities identified'}\n\n"
711
+ f"TASK: Generate exactly 3 strategic follow-up questions for DRIFT search.\n\n"
712
+ f"OUTPUT FORMAT (strict JSON):\n"
713
+ f"[\n"
714
+ f" {{\n"
715
+ f" \"question\": \"Specific, actionable question about the community\",\n"
716
+ f" \"relevance_score\": 0.85,\n"
717
+ f" \"search_type\": \"local\",\n"
718
+ f" \"target_entities\": [\"entity1\", \"entity2\"]\n"
719
+ f" }}\n"
720
+ f"]\n\n"
721
+ f"VALIDATION REQUIREMENTS:\n"
722
+ f"- question: Must be a clear, specific question that expands community understanding\n"
723
+ f"- relevance_score: Float 0.0-1.0 based on potential value for knowledge expansion\n"
724
+ f"- search_type: Must be one of 'local', 'relationship', or 'global'\n"
725
+ f"- target_entities: Array of relevant entity names from the key entities list\n\n"
726
+ f"IMPORTANT: Respond with ONLY the JSON array. No markdown formatting, no explanations, no code blocks."
727
+ )
728
+
729
+ try:
730
+ # Use existing LLM infrastructure
731
+ if self.llm_provider == "cerebras":
732
+ response = self._cerebras_inference(system_prompt, user_prompt)
733
+ else:
734
+ response = self._gemini_inference(system_prompt, user_prompt)
735
+
736
+ # Parse LLM response to structured questions
737
+ questions = self._parse_questions_response(response, key_entities)
738
+ return questions
739
+
740
+ except Exception as e:
741
+ logger.error(f"❌ Question generation failed for comm-{comm_id}: {e}")
742
+ return []
743
+
744
+ def _parse_questions_response(self, response: str, key_entities: List[str]) -> List[Dict]:
745
+ """Parse LLM response into structured questions using robust multi-strategy approach"""
746
+ try:
747
+ # Calculate dynamic default relevance based on community statistics
748
+ total_nodes = self.nx_graph.number_of_nodes() if hasattr(self, 'nx_graph') else 100
749
+ node_density = min(1.0, total_nodes / 500) # Scale 0-1
750
+ default_relevance = round(0.5 + (node_density * 0.4), 2) # Range: 0.5-0.9
751
+ max_questions = max(2, min(5, len(key_entities) + 1)) # Adaptive question count
752
+
753
+ # Strategy 1: JSON array extraction with regex
754
+ try:
755
+ import re
756
+ match = re.search(r"(\[\s*\{[\s\S]*?\}\s*\])", response)
757
+ if match:
758
+ json_str = match.group(1)
759
+ try:
760
+ questions = self._smart_json_parse_questions(json_str)
761
+ if questions:
762
+ return self._validate_and_normalize_questions(questions, key_entities, default_relevance, max_questions)
763
+ except ValueError:
764
+ pass # Continue to next strategy if JSON parsing fails
765
+ except Exception:
766
+ pass
767
+
768
+ # Strategy 2: Multiple JSON objects extraction
769
+ try:
770
+ import re
771
+ pattern = r'\{[^{}]*"question"[^{}]*\}'
772
+ matches = re.findall(pattern, response)
773
+ if matches:
774
+ json_array = "[" + ",".join(matches) + "]"
775
+ try:
776
+ questions = self._smart_json_parse_questions(json_array)
777
+ if questions:
778
+ return self._validate_and_normalize_questions(questions, key_entities, default_relevance, max_questions)
779
+ except ValueError:
780
+ pass # Continue to next strategy if JSON parsing fails
781
+ except Exception:
782
+ pass
783
+
784
+ # Strategy 3: Markdown list extraction
785
+ try:
786
+ questions = self._parse_markdown_questions(response, key_entities, default_relevance)
787
+ if questions:
788
+ return self._validate_and_normalize_questions(questions, key_entities, default_relevance, max_questions)
789
+ except Exception:
790
+ pass
791
+
792
+ # Strategy 4: Generate default questions based on entities
793
+ return self._generate_default_questions(key_entities, default_relevance, max_questions)
794
+
795
+ except Exception as e:
796
+ logger.warning(f"⚠️ All question parsing strategies failed: {e}")
797
+ return self._generate_default_questions(key_entities, 0.7, 3)
798
+
799
+ def _smart_json_parse_questions(self, json_text: str) -> List[Dict]:
800
+ """
801
+ Simple 5-step JSON parsing approach (exactly the same as in Phase 1): four parse attempts, then fail
802
+ """
803
+ cleaned_text = json_text.strip()
804
+
805
+ # Step 1: orjson
806
+ try:
807
+ result = orjson.loads(cleaned_text.encode('utf-8'))
808
+ logger.debug("✅ Step 1: orjson succeeded")
809
+ return result
810
+ except Exception as e:
811
+ logger.debug(f"❌ Step 1: orjson failed - {e}")
812
+
813
+ # Step 2: json-repair
814
+ try:
815
+ repaired = repair_json(cleaned_text)
816
+ result = orjson.loads(repaired.encode('utf-8'))
817
+ logger.debug("✅ Step 2: json-repair + orjson succeeded")
818
+ return result
819
+ except Exception as e:
820
+ logger.debug(f"❌ Step 2: json-repair failed - {e}")
821
+
822
+ # Step 3: standard json
823
+ try:
824
+ result = json.loads(cleaned_text)
825
+ logger.debug("✅ Step 3: standard json succeeded")
826
+ return result
827
+ except Exception as e:
828
+ logger.debug(f"❌ Step 3: standard json failed - {e}")
829
+
830
+ # Step 4: json-repair + standard json
831
+ try:
832
+ repaired = repair_json(cleaned_text)
833
+ result = json.loads(repaired)
834
+ logger.debug("✅ Step 4: json-repair + standard json succeeded")
835
+ return result
836
+ except Exception as e:
837
+ logger.debug(f"❌ Step 4: json-repair + standard json failed - {e}")
838
+
839
+ # Step 5: all parsing attempts failed - raise so the caller can fall back to the next extraction strategy
840
+ raise ValueError("All 4 JSON parsing steps failed")
841
+
842
+ def _parse_markdown_questions(self, response: str, key_entities: List[str], default_relevance: float) -> List[Dict]:
843
+ """Parse questions from markdown or plain text format"""
844
+ questions = []
845
+
846
+ # Look for numbered lists or bullet points
847
+ import re
848
+ patterns = [
849
+ r'\d+\.\s*(.+?)(?=\n\d+\.|\n-|\n\*|$)', # Numbered list
850
+ r'-\s*(.+?)(?=\n-|\n\*|\n\d+\.|$)', # Dash list
851
+ r'\*\s*(.+?)(?=\n\*|\n-|\n\d+\.|$)' # Asterisk list
852
+ ]
853
+
854
+ for pattern in patterns:
855
+ matches = re.findall(pattern, response, re.MULTILINE | re.DOTALL)
856
+ if matches and len(matches) >= 2:
857
+ for i, match in enumerate(matches[:5]): # Max 5 questions
858
+ question_text = match.strip().replace('\n', ' ')
859
+ if len(question_text) > 10: # Reasonable question length
860
+ search_type = 'global' if any(word in question_text.lower()
861
+ for word in ['analyze', 'compare', 'overall', 'trends']) else 'local'
862
+ questions.append({
863
+ 'question': question_text,
864
+ 'relevance_score': max(0.6, default_relevance - (i * 0.1)),
865
+ 'search_type': search_type,
866
+ 'target_entities': key_entities[:2] if key_entities else []
867
+ })
868
+ break
869
+
870
+ return questions
871
+
872
+ def _generate_default_questions(self, key_entities: List[str], default_relevance: float, max_questions: int) -> List[Dict]:
873
+ """Generate default questions when parsing fails"""
874
+ if not key_entities:
875
+ return []
876
+
877
+ # Template questions based on entity analysis
878
+ question_templates = [
879
+ ("What is {entity} and what role does it play?", "local"),
880
+ ("How does {entity} relate to other entities in this community?", "relationship"),
881
+ ("What are the key characteristics and properties of {entity}?", "local"),
882
+ ("What trends or patterns involve {entity}?", "global"),
883
+ ("How might {entity} impact the broader context?", "global")
884
+ ]
885
+
886
+ questions = []
887
+ entities_to_use = key_entities[:max_questions]
888
+
889
+ for i, entity in enumerate(entities_to_use):
890
+ if i < len(question_templates):
891
+ template, search_type = question_templates[i]
892
+ question = template.format(entity=entity)
893
+ questions.append({
894
+ 'question': question,
895
+ 'relevance_score': max(0.6, default_relevance - (i * 0.05)),
896
+ 'search_type': search_type,
897
+ 'target_entities': [entity]
898
+ })
899
+
900
+ return questions
901
+
902
+ def _validate_and_normalize_questions(self, questions: List[Dict], key_entities: List[str],
903
+ default_relevance: float, max_questions: int) -> List[Dict]:
904
+ """Validate and normalize question format"""
905
+ normalized = []
906
+
907
+ for q in questions:
908
+ if not isinstance(q, dict):
909
+ continue
910
+
911
+ # Extract question text
912
+ question = q.get('question') or q.get('q') or q.get('text')
913
+ if not question or len(str(question).strip()) < 5:
914
+ continue
915
+
916
+ # Extract and validate relevance score
917
+ relevance = q.get('relevance_score', default_relevance)
918
+ try:
919
+ relevance = float(relevance)
920
+ if relevance <= 0 or relevance > 1:
921
+ relevance = default_relevance
922
+ except (ValueError, TypeError):
923
+ relevance = default_relevance
924
+
925
+ # Extract and validate search type
926
+ search_type = q.get('search_type', 'local')
927
+ if search_type not in ('local', 'relationship', 'global'):
928
+ search_type = 'local'
929
+
930
+ # Extract target entities
931
+ target_entities = q.get('target_entities', [])
932
+ if not isinstance(target_entities, list):
933
+ target_entities = []
934
+
935
+ # Ensure we have some target entities
936
+ if not target_entities and key_entities:
937
+ target_entities = key_entities[:2]
938
+
939
+ normalized.append({
940
+ 'question': str(question).strip(),
941
+ 'relevance_score': round(relevance, 2),
942
+ 'search_type': search_type,
943
+ 'target_entities': target_entities
944
+ })
945
+
946
+ if len(normalized) >= max_questions:
947
+ break
948
+
949
+ return normalized
950
+
951
+ # STEP 10: Main Processing Entry Point
952
+ def generate_summaries(self, input_path: str = None, output_path: str = None) -> bool:
953
+ """Main entry point for Phase 3"""
954
+ if output_path is None:
955
+ output_path = "workspace/graph_data/graph-data-final.json"
956
+
957
+ logger.info("🚀 Starting Phase 3: Community Summarization")
958
+ logger.info("=" * 60)
959
+
960
+ start_time = time.time()
961
+
962
+ # Step 1: Load Phase 2 output
963
+ if not self.load_graph_data(input_path):
964
+ return False
965
+
966
+ # Step 2: Build NetworkX graph
967
+ self.nx_graph = self._build_networkx_graph()
968
+
969
+ # Step 3: Extract community assignments
970
+ self.community_assignments = self._extract_community_assignments()
971
+
972
+ # Step 4: Generate LLM summaries
973
+ community_summaries = self._generate_community_summaries()
974
+
975
+ # Step 5: Identify key entities
976
+ key_entities = self._identify_key_entities()
977
+
978
+ # Step 6: Create community nodes
979
+ community_nodes = self._create_community_nodes(community_summaries, key_entities)
980
+
981
+ # Step 7: Create IN_COMMUNITY relationships
982
+ community_relationships = self._create_in_community_relationships(community_nodes)
983
+
984
+ # Step 8: Merge everything
985
+ self.graph_data["nodes"].extend(community_nodes)
986
+ self.graph_data["relationships"].extend(community_relationships)
987
+
988
+ # Step 9: Add communities section
989
+ self.graph_data["communities"] = {
990
+ "algorithm": "Leiden",
991
+ "total_communities": len(community_summaries),
992
+ "modularity_score": self.graph_data["metadata"]["community_detection"]["modularity_score"],
993
+ "summaries": {
994
+ f"comm-{k}": v for k, v in community_summaries.items()
995
+ }
996
+ }
997
+
998
+ # Step 10: Generate DRIFT search metadata
999
+ drift_metadata = self._generate_drift_metadata(community_summaries, key_entities)
1000
+ if drift_metadata:
1001
+ self.graph_data["drift_search_metadata"] = drift_metadata
1002
+ logger.info("✅ Added DRIFT search metadata to graph data")
1003
+
1004
+ # Step 11: Clean up temporary data
1005
+ if "community_stats" in self.graph_data:
1006
+ del self.graph_data["community_stats"]
1007
+
1008
+ # Step 12: Update metadata
1009
+ self.graph_data["metadata"]["phase"] = "final"
1010
+ self.graph_data["metadata"]["entity_count"] = len([n for n in self.graph_data["nodes"] if "Community" not in n["labels"]])
1011
+ self.graph_data["metadata"]["community_count"] = len(community_nodes)
1012
+ self.graph_data["metadata"]["total_node_count"] = len(self.graph_data["nodes"])
1013
+ self.graph_data["metadata"]["total_relationship_count"] = len(self.graph_data["relationships"])
1014
+
1015
+ # Step 13: Save final output
1016
+ if self._save_final_output(output_path):
1017
+ elapsed = time.time() - start_time
1018
+ logger.info("=" * 60)
1019
+ logger.info(f"✅ Phase 3 completed successfully in {elapsed:.1f}s")
1020
+ logger.info("📊 Final stats:")
1021
+ logger.info(f" - Total nodes: {len(self.graph_data['nodes'])}")
1022
+ logger.info(f" - Entity nodes: {self.graph_data['metadata']['entity_count']}")
1023
+ logger.info(f" - Community nodes: {len(community_nodes)}")
1024
+ logger.info(f" - Total relationships: {len(self.graph_data['relationships'])}")
1025
+ logger.info(f" - Communities with summaries: {len(community_summaries)}")
1026
+ logger.info(f" - Output saved to: {output_path}")
1027
+ return True
1028
+ else:
1029
+ return False
1030
+
1031
+ # STEP 14: Save Final Output
1032
+ def _save_final_output(self, output_path: str) -> bool:
1033
+ """Save graph-data-final.json with DRIFT search metadata"""
1034
+ try:
1035
+ # Ensure output directory exists
1036
+ output_dir = Path(output_path).parent
1037
+ output_dir.mkdir(parents=True, exist_ok=True)
1038
+
1039
+ # Save final output
1040
+ with open(output_path, 'w', encoding='utf-8') as f:
1041
+ json.dump(self.graph_data, f, indent=2, ensure_ascii=False)
1042
+
1043
+ # Calculate file size
1044
+ output_size = os.path.getsize(output_path)
1045
+ output_size_mb = output_size / (1024 * 1024)
1046
+
1047
+ logger.info(f"💾 Saved final output: {output_path} ({output_size_mb:.2f} MB)")
1048
+
1049
+ return True
1050
+
1051
+ except Exception as e:
1052
+ logger.error(f"❌ Error saving final output: {e}")
1053
+ return False
1054
+
1055
+
1056
+ # STEP 15: Main Entry Point
1057
+ def main():
1058
+ """Main function to run Phase 3: Community Summarization with DRIFT Search Metadata"""
1059
+ logger.info("🚀 GraphRAG Phase 3: Community Summarization + DRIFT Search Metadata")
1060
+ logger.info(" Input: graph-data-phase-2.json (from Phase 2)")
1061
+ logger.info(" Output: graph-data-final.json (with DRIFT search metadata)")
1062
+ logger.info("")
1063
+
1064
+ # Choose LLM provider from environment or default to cerebras
1065
+ llm_provider = os.getenv("GRAPH_LLM_PROVIDER", "cerebras").lower()
1066
+ logger.info(f" Using LLM provider: {llm_provider.upper()}")
1067
+
1068
+ try:
1069
+ # Initialize Phase 3 processor
1070
+ processor = GraphBuilderPhase3(llm_provider=llm_provider)
1071
+
1072
+ # Generate summaries
1073
+ success = processor.generate_summaries()
1074
+
1075
+ if success:
1076
+ logger.info("")
1077
+ logger.info("✅ Phase 3 completed successfully!")
1078
+ logger.info("� DRIFT search metadata generated and included")
1079
+ logger.info("�📋 Next step: Upload to Neo4j using 3b_save_to_graph_db.py")
1080
+ logger.info(" The graph-data-final.json is now ready for Neo4j import with DRIFT capabilities")
1081
+ return 0
1082
+ else:
1083
+ logger.error("")
1084
+ logger.error("❌ Phase 3 failed")
1085
+ logger.error(" Please check the logs above for details")
1086
+ return 1
1087
+
1088
+ except Exception as e:
1089
+ logger.error(f"❌ Phase 3 pipeline failed: {e}")
1090
+ import traceback
1091
+ logger.error(traceback.format_exc())
1092
+ return 1
1093
+
1094
+
1095
+ if __name__ == "__main__":
1096
+ exit(main())
3_save_to_vector_db.ipynb ADDED
@@ -0,0 +1,329 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Save Markdown text into Vector DB"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "## Step-1: Config"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 1,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "from my_config import MY_CONFIG"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "markdown",
28
+ "metadata": {},
29
+ "source": [
30
+ "## Step-2: Read Markdown"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 2,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "import os\n",
40
+ "import glob\n",
41
+ "\n",
42
+ "pattern = os.path.join(MY_CONFIG.PROCESSED_DATA_DIR, '*.md')\n",
43
+ "md_file_count = len(glob.glob(pattern, recursive=True)) "
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 3,
49
+ "metadata": {},
50
+ "outputs": [
51
+ {
52
+ "name": "stdout",
53
+ "output_type": "stream",
54
+ "text": [
55
+ "Loaded 96 documents from 96 files\n"
56
+ ]
57
+ }
58
+ ],
59
+ "source": [
60
+ "from llama_index.core import SimpleDirectoryReader\n",
61
+ "\n",
62
+ "reader = SimpleDirectoryReader(input_dir=MY_CONFIG.PROCESSED_DATA_DIR, recursive=False , required_exts=[\".md\"])\n",
63
+ "documents = reader.load_data()\n",
64
+ "\n",
65
+ "print (f\"Loaded {len(documents)} documents from {md_file_count} files\")\n"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 4,
71
+ "metadata": {},
72
+ "outputs": [
73
+ {
74
+ "name": "stdout",
75
+ "output_type": "stream",
76
+ "text": [
77
+ "Doc ID: 20eef2cd-ee21-4dd4-baf6-eda09d5d793b\n",
78
+ "Text: # Building the open future of AI We are technology developers,\n",
79
+ "researchers, industry leaders and advocates who collaborate to advance\n",
80
+ "safe, responsible AI rooted in open innovation. ![Conference\n",
81
+ "Speaker](https://images.prismic.io/ai-alliance/Zy08cq8jQArT0jJI_Imagef\n",
82
+ "romNotion.jpeg?auto=format%2Ccompress&fit=max&w=3840) ![Skills &\n",
83
+ "Education](htt...\n"
84
+ ]
85
+ }
86
+ ],
87
+ "source": [
88
+ "## Inspect a sample doc\n",
89
+ "print (documents[0])"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "metadata": {},
95
+ "source": [
96
+ "## Step-3: Create Chunks"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 5,
102
+ "metadata": {},
103
+ "outputs": [
104
+ {
105
+ "name": "stdout",
106
+ "output_type": "stream",
107
+ "text": [
108
+ "Created 223 chunks from 96 documents\n"
109
+ ]
110
+ }
111
+ ],
112
+ "source": [
113
+ "from llama_index.core import Document\n",
114
+ "from llama_index.core.node_parser import SentenceSplitter\n",
115
+ "\n",
116
+ "parser = SentenceSplitter(chunk_size=MY_CONFIG.CHUNK_SIZE, chunk_overlap=MY_CONFIG.CHUNK_OVERLAP)\n",
117
+ "nodes = parser.get_nodes_from_documents(documents)\n",
118
+ "print(f\"Created {len(nodes)} chunks from {len(documents)} documents\")"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "markdown",
123
+ "metadata": {},
124
+ "source": [
125
+ "## Step-4: Setup Embedding Model"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 6,
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "# If connection to https://huggingface.co/ failed, uncomment the following path\n",
135
+ "import os\n",
136
+ "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 7,
142
+ "metadata": {},
143
+ "outputs": [
144
+ {
145
+ "name": "stderr",
146
+ "output_type": "stream",
147
+ "text": [
148
+ "/home/sujee/apps/anaconda3/envs/allycat-6/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
149
+ " from .autonotebook import tqdm as notebook_tqdm\n"
150
+ ]
151
+ }
152
+ ],
153
+ "source": [
154
+ "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
155
+ "from llama_index.core import Settings\n",
156
+ "\n",
157
+ "Settings.embed_model = HuggingFaceEmbedding(\n",
158
+ " model_name = MY_CONFIG.EMBEDDING_MODEL\n",
159
+ ")"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "markdown",
164
+ "metadata": {},
165
+ "source": [
166
+ "## Step-5: Connect to Milvus"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": 8,
172
+ "metadata": {},
173
+ "outputs": [
174
+ {
175
+ "name": "stdout",
176
+ "output_type": "stream",
177
+ "text": [
178
+ "✅ Connected to Milvus instance: workspace/rag_website_milvus.db\n"
179
+ ]
180
+ }
181
+ ],
182
+ "source": [
183
+ "## Clear up any old data\n",
184
+ "\n",
185
+ "from pymilvus import MilvusClient\n",
186
+ "\n",
187
+ "milvus_client = MilvusClient(MY_CONFIG.DB_URI)\n",
188
+ "print (\"✅ Connected to Milvus instance: \", MY_CONFIG.DB_URI )\n",
189
+ "\n",
190
+ "# if we already have a collection, clear it first\n",
191
+ "if milvus_client.has_collection(collection_name = MY_CONFIG.COLLECTION_NAME):\n",
192
+ " milvus_client.drop_collection(collection_name = MY_CONFIG.COLLECTION_NAME)\n",
193
+ " print ('✅ Cleared collection :', MY_CONFIG.COLLECTION_NAME)\n",
194
+ " "
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 9,
200
+ "metadata": {},
201
+ "outputs": [
202
+ {
203
+ "name": "stderr",
204
+ "output_type": "stream",
205
+ "text": [
206
+ "2025-05-12 23:36:12,218 [DEBUG][_create_connection]: Created new connection using: f81ea0e5320b44f7b5ba8b89f6aa43f7 (async_milvus_client.py:600)\n"
207
+ ]
208
+ },
209
+ {
210
+ "name": "stdout",
211
+ "output_type": "stream",
212
+ "text": [
213
+ "✅ Connected Llama-index to Milvus instance: workspace/rag_website_milvus.db\n"
214
+ ]
215
+ }
216
+ ],
217
+ "source": [
218
+ "# connect llama-index to vector db\n",
219
+ "\n",
220
+ "from llama_index.core import StorageContext\n",
221
+ "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
222
+ "\n",
223
+ "vector_store = MilvusVectorStore(\n",
224
+ " uri = MY_CONFIG.DB_URI ,\n",
225
+ " dim = MY_CONFIG.EMBEDDING_LENGTH , \n",
226
+ " collection_name = MY_CONFIG.COLLECTION_NAME,\n",
227
+ " overwrite=True\n",
228
+ ")\n",
229
+ "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
230
+ "\n",
231
+ "print (\"✅ Connected Llama-index to Milvus instance: \", MY_CONFIG.DB_URI )"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "markdown",
236
+ "metadata": {},
237
+ "source": [
238
+ "## Step-6: Save to DB"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": 10,
244
+ "metadata": {},
245
+ "outputs": [
246
+ {
247
+ "name": "stdout",
248
+ "output_type": "stream",
249
+ "text": [
250
+ "CPU times: user 9 μs, sys: 0 ns, total: 9 μs\n",
251
+ "Wall time: 18.8 μs\n"
252
+ ]
253
+ }
254
+ ],
255
+ "source": [
256
+ "%%time\n",
257
+ "\n",
258
+ "## We save entire md documents into vector store\n",
259
+ "\n",
260
+ "# from llama_index.core import VectorStoreIndex\n",
261
+ "\n",
262
+ "# index = VectorStoreIndex.from_documents(\n",
263
+ "# documents, storage_context=storage_context\n",
264
+ "# )\n",
265
+ "# print (f\"✅ Saved {len(documents)} documents to db: {MY_CONFIG.DB_URI}\" )"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "code",
270
+ "execution_count": 11,
271
+ "metadata": {},
272
+ "outputs": [
273
+ {
274
+ "name": "stdout",
275
+ "output_type": "stream",
276
+ "text": [
277
+ "Successfully stored 223 chunks in Milvus collection 'pages'\n",
278
+ "CPU times: user 900 ms, sys: 142 ms, total: 1.04 s\n",
279
+ "Wall time: 807 ms\n"
280
+ ]
281
+ }
282
+ ],
283
+ "source": [
284
+ "%%time \n",
285
+ "\n",
286
+ "# save chunks into vector db\n",
287
+ "\n",
288
+ "from llama_index.core import VectorStoreIndex\n",
289
+ "\n",
290
+ "index = VectorStoreIndex(\n",
291
+ " nodes=nodes,\n",
292
+ " storage_context=storage_context,\n",
293
+ " )\n",
294
+ "\n",
295
+ "print(f\"Successfully stored {len(nodes)} chunks in Milvus collection '{MY_CONFIG.COLLECTION_NAME}'\")\n"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "code",
300
+ "execution_count": null,
301
+ "metadata": {},
302
+ "outputs": [],
303
+ "source": [
304
+ "milvus_client.close()"
305
+ ]
306
+ }
307
+ ],
308
+ "metadata": {
309
+ "kernelspec": {
310
+ "display_name": "allycat-6",
311
+ "language": "python",
312
+ "name": "python3"
313
+ },
314
+ "language_info": {
315
+ "codemirror_mode": {
316
+ "name": "ipython",
317
+ "version": 3
318
+ },
319
+ "file_extension": ".py",
320
+ "mimetype": "text/x-python",
321
+ "name": "python",
322
+ "nbconvert_exporter": "python",
323
+ "pygments_lexer": "ipython3",
324
+ "version": "3.11.11"
325
+ }
326
+ },
327
+ "nbformat": 4,
328
+ "nbformat_minor": 2
329
+ }
3_save_to_vector_db.py ADDED
@@ -0,0 +1,85 @@
1
+ from my_config import MY_CONFIG
2
+ import os
3
+ import glob
4
+ from llama_index.core import SimpleDirectoryReader
5
+ from llama_index.core.node_parser import SentenceSplitter
6
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
7
+ from llama_index.core import Settings
8
+ from pymilvus import MilvusClient
9
+ from llama_index.core import StorageContext
10
+ from llama_index.vector_stores.milvus import MilvusVectorStore
11
+ from llama_index.core import VectorStoreIndex
12
+ import logging
13
+
14
+
15
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
16
+ logger = logging.getLogger(__name__)
17
+ # logger.setLevel(logging.INFO)
18
+
19
+ # Step-1: Read Markdown files
20
+ pattern = os.path.join(MY_CONFIG.PROCESSED_DATA_DIR, '*.md')
21
+ md_file_count = len(glob.glob(pattern, recursive=True))
22
+
23
+ reader = SimpleDirectoryReader(input_dir=MY_CONFIG.PROCESSED_DATA_DIR, recursive=False, required_exts=[".md"])
24
+ documents = reader.load_data()
25
+ logger.info (f"Loaded {len(documents)} documents from {md_file_count} files")
26
+
27
+ # Step-2: Create Chunks
28
+ parser = SentenceSplitter(chunk_size=MY_CONFIG.CHUNK_SIZE, chunk_overlap=MY_CONFIG.CHUNK_OVERLAP)
29
+ nodes = parser.get_nodes_from_documents(documents)
30
+ logger.info (f"Created {len(nodes)} chunks from {len(documents)} documents")
31
+
32
+ # Step-3: Setup Embedding Model
33
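+ # HF_ENDPOINT can point to a Hugging Face mirror when huggingface.co is unreachable (see the notebook note)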
+ os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT
34
+
35
+ Settings.embed_model = HuggingFaceEmbedding(
36
+ model_name = MY_CONFIG.EMBEDDING_MODEL
37
+ )
38
+
39
+ # Step-4: Create 2 Vector Databases (Vector RAG and Hybrid GraphRAG databases)
40
+
41
+ databases_to_create = [
42
+ {
43
+ "name": "Vector RAG Only",
44
+ "uri": MY_CONFIG.MILVUS_URI_VECTOR,
45
+ "description": "For Vector RAG systems"
46
+ },
47
+ {
48
+ "name": "Hybrid GraphRAG",
49
+ "uri": MY_CONFIG.MILVUS_URI_HYBRID_GRAPH,
50
+ "description": "For Hybrid GraphRAG systems"
51
+ }
52
+ ]
53
+
54
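+ # Both databases receive the same chunks; separate URIs let the vector-only app and the hybrid GraphRAG app use independent stores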
+ for db_config in databases_to_create:
55
+ logger.info(f"📦 Creating {db_config['name']} database...")
56
+
57
+ # Connect to Milvus for this database
58
+ milvus_client = MilvusClient(db_config['uri'])
59
+ logger.info(f"✅ Connected to: {db_config['uri']}")
60
+
61
+ if milvus_client.has_collection(collection_name = MY_CONFIG.COLLECTION_NAME):
62
+ milvus_client.drop_collection(collection_name = MY_CONFIG.COLLECTION_NAME)
63
+ logger.info(f"✅ Cleared collection: {MY_CONFIG.COLLECTION_NAME}")
64
+
65
+ # Connect llama-index to vector db
66
+ vector_store = MilvusVectorStore(
67
+ uri = db_config['uri'],
68
+ dim = MY_CONFIG.EMBEDDING_LENGTH,
69
+ collection_name = MY_CONFIG.COLLECTION_NAME,
70
+ overwrite=True
71
+ )
72
+ storage_context = StorageContext.from_defaults(vector_store=vector_store)
73
+
74
+ # Save chunks into vector db
75
+ index = VectorStoreIndex(
76
+ nodes=nodes,
77
+ storage_context=storage_context,
78
+ )
79
+
80
+ logger.info(f"✅ Stored {len(nodes)} chunks in {db_config['name']}")
81
+ milvus_client.close()
82
+
83
+ logger.info("🎉 Both databases created!")
84
+ logger.info(f" • Vector RAG: {MY_CONFIG.MILVUS_URI_VECTOR}")
85
+ logger.info(f" • Hybrid GraphRAG: {MY_CONFIG.MILVUS_URI_HYBRID_GRAPH}")
3_save_to_vector_db_zilliz.py ADDED
@@ -0,0 +1,106 @@
1
+ """
2
+ Cloud Vector Database Setup
3
+
4
+ Creates vector database collections on cloud infrastructure.
5
+ Supports both vector search and graph-based retrieval systems.
6
+ """
7
+
8
+ from my_config import MY_CONFIG
9
+ import os
10
+ import sys
11
+ from llama_index.core import SimpleDirectoryReader
12
+ from llama_index.core.node_parser import SentenceSplitter
13
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
14
+ from llama_index.core import Settings
15
+ from pymilvus import MilvusClient
16
+ from llama_index.core import StorageContext
17
+ from llama_index.vector_stores.milvus import MilvusVectorStore
18
+ from llama_index.core import VectorStoreIndex
19
+ import logging
20
+
21
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # Validate cloud database configuration
25
+ if not MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT:
26
+ raise ValueError("Cloud endpoint configuration missing")
27
+ if not MY_CONFIG.ZILLIZ_TOKEN:
28
+ raise ValueError("Cloud authentication token missing")
29
+
30
+ def main():
31
+ logger.info("Initializing cloud database connection")
32
+
33
+ # Load source documents
34
+ logger.info("Loading documents")
35
+ reader = SimpleDirectoryReader(input_dir=MY_CONFIG.PROCESSED_DATA_DIR, recursive=False, required_exts=[".md"])
36
+ documents = reader.load_data()
37
+ logger.info(f"Loaded {len(documents)} documents")
38
+
39
+ # Process document chunks
40
+ logger.info("Processing document chunks")
41
+ parser = SentenceSplitter(chunk_size=MY_CONFIG.CHUNK_SIZE, chunk_overlap=MY_CONFIG.CHUNK_OVERLAP)
42
+ nodes = parser.get_nodes_from_documents(documents)
43
+ logger.info(f"Created {len(nodes)} chunks")
44
+
45
+ # Initialize embedding model
46
+ logger.info("Configuring embedding model")
47
+ os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT
48
+
49
+ Settings.embed_model = HuggingFaceEmbedding(
50
+ model_name=MY_CONFIG.EMBEDDING_MODEL
51
+ )
52
+
53
+ # Create cloud database collection
54
+ logger.info("Creating database collection")
55
+ collection_name = MY_CONFIG.COLLECTION_NAME
56
+
57
+ milvus_client = None
58
+ try:
59
+ # Connect to cloud database
60
+ milvus_client = MilvusClient(
61
+ uri=MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT,
62
+ token=MY_CONFIG.ZILLIZ_TOKEN
63
+ )
64
+
65
+ # Remove existing collection if present
66
+ if milvus_client.has_collection(collection_name=collection_name):
67
+ milvus_client.drop_collection(collection_name=collection_name)
68
+
69
+ # Initialize vector store
70
+ vector_store = MilvusVectorStore(
71
+ uri=MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT,
72
+ token=MY_CONFIG.ZILLIZ_TOKEN,
73
+ collection_name=collection_name,
74
+ dim=MY_CONFIG.EMBEDDING_LENGTH,
75
+ overwrite=True
76
+ )
77
+ storage_context = StorageContext.from_defaults(vector_store=vector_store)
78
+
79
+ # Store document vectors
80
+ logger.info(f"Processing {len(nodes)} document chunks")
81
+ VectorStoreIndex(
82
+ nodes=nodes,
83
+ storage_context=storage_context,
84
+ )
85
+
86
+ logger.info(f"Database collection '{collection_name}' created successfully")
87
+
88
+ except Exception as e:
89
+ logger.error(f"Failed to create collection: {str(e)}")
90
+ raise
91
+ finally:
92
+ if milvus_client:
93
+ milvus_client.close()
94
+
95
+ logger.info("Cloud database setup completed successfully")
96
+
97
+ if __name__ == "__main__":
98
+ try:
99
+ main()
100
+ sys.exit(0)
101
+ except KeyboardInterrupt:
102
+ logger.info("Operation cancelled by user")
103
+ sys.exit(1)
104
+ except Exception as e:
105
+ logger.error(f"Fatal error: {str(e)}")
106
+ sys.exit(1)
3b_save_to_graph_db.py ADDED
@@ -0,0 +1,1050 @@
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import os
5
+ import sys
6
+ from typing import Any, Dict, Optional
7
+ from my_config import MY_CONFIG
8
+ from neo4j import GraphDatabase, Driver
9
+ from tqdm import tqdm
10
+ from fastmcp import FastMCP
11
+
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
+ GRAPH_DATA_DIR = MY_CONFIG.GRAPH_DATA_DIR
16
+ GRAPH_DATA_FILE = os.path.join(GRAPH_DATA_DIR, "graph-data-final.json")
17
+
18
+ class Neo4jConnection:
19
+ def __init__(self):
20
+ self.uri = MY_CONFIG.NEO4J_URI
21
+ self.username = MY_CONFIG.NEO4J_USER
22
+ self.password = MY_CONFIG.NEO4J_PASSWORD
23
+ self.database = getattr(MY_CONFIG, "NEO4J_DATABASE", None)
24
+ if not self.uri:
25
+ raise ValueError("NEO4J_URI config is required")
26
+ if not self.username:
27
+ raise ValueError("NEO4J_USERNAME config is required")
28
+ if not self.password:
29
+ raise ValueError("NEO4J_PASSWORD config is required")
30
+ if not self.database:
31
+ raise ValueError("NEO4J_DATABASE config is required")
32
+ self.driver: Optional[Driver] = None
33
+
34
+ async def connect(self):
35
+ if self.driver is None:
36
+ try:
37
+ self.driver = GraphDatabase.driver(
38
+ self.uri,
39
+ auth=(self.username, self.password)
40
+ )
41
+
42
+ await asyncio.get_event_loop().run_in_executor(
43
+ None, self.driver.verify_connectivity
44
+ )
45
+ logger.info("Connected to Neo4j")
46
+
47
+ except Exception as e:
48
+ logger.error(f"Connection failed: {e}")
49
+ self.driver = None
50
+
51
+ async def disconnect(self):
52
+ if self.driver:
53
+ await asyncio.get_event_loop().run_in_executor(
54
+ None, self.driver.close
55
+ )
56
+ self.driver = None
57
+
58
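+ # Run a Cypher query via the synchronous Neo4j driver inside a thread-pool executor so the async event loop is not blocked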
+ async def execute_query(self, query: str, parameters: Optional[Dict[str, Any]] = None):
59
+ if not self.driver:
60
+ raise ConnectionError("Not connected to Neo4j database")
61
+
62
+ def run_query():
63
+ with self.driver.session(database=self.database) as session:
64
+ result = session.run(query, parameters or {})
65
+ records = [record.data() for record in result]
66
+ summary = result.consume()
67
+ return records, summary
68
+
69
+ return await asyncio.get_event_loop().run_in_executor(None, run_query)
70
+
71
+ neo4j_connection = Neo4jConnection()
72
+
73
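+ # Expose the upload and maintenance operations below as MCP tools through FastMCP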
+ app = FastMCP("Neo4j Graph Data Upload Server")
74
+
75
+ @app.tool()
76
+ async def execute_cypher(query: str, parameters: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
77
+ try:
78
+ if not neo4j_connection.driver:
79
+ await neo4j_connection.connect()
80
+ if not neo4j_connection.driver:
81
+ return {
82
+ "status": "error",
83
+ "error": "Unable to connect to Neo4j database",
84
+ "details": "Check connection settings and network connectivity"
85
+ }
86
+
87
+ records, summary = await neo4j_connection.execute_query(query, parameters)
88
+
89
+ return {
90
+ "status": "success",
91
+ "query": query,
92
+ "parameters": parameters or {},
93
+ "records": records,
94
+ "record_count": len(records),
95
+ "execution_time_ms": summary.result_available_after,
96
+ "summary": {
97
+ "query_type": summary.query_type,
98
+ "counters": dict(summary.counters) if summary.counters else {}
99
+ }
100
+ }
101
+
102
+ except Exception as e:
103
+ return {
104
+ "status": "error",
105
+ "query": query,
106
+ "error": str(e)
107
+ }
108
+
109
+
110
+ @app.tool()
111
+ async def get_database_schema() -> Dict[str, Any]:
112
+ try:
113
+ if not neo4j_connection.driver:
114
+ await neo4j_connection.connect()
115
+ if not neo4j_connection.driver:
116
+ return {
117
+ "status": "error",
118
+ "error": "Unable to connect to Neo4j database"
119
+ }
120
+
121
+ labels_records, _ = await neo4j_connection.execute_query("CALL db.labels()")
122
+ labels = [record["label"] for record in labels_records]
123
+
124
+ rel_records, _ = await neo4j_connection.execute_query("CALL db.relationshipTypes()")
125
+ relationships = [record["relationshipType"] for record in rel_records]
126
+
127
+ prop_records, _ = await neo4j_connection.execute_query("CALL db.propertyKeys()")
128
+ properties = [record["propertyKey"] for record in prop_records]
129
+
130
+ try:
131
+ constraint_records, _ = await neo4j_connection.execute_query("SHOW CONSTRAINTS")
132
+ constraints = [dict(record) for record in constraint_records]
133
+ except Exception:
134
+ constraints = []
135
+
136
+ try:
137
+ index_records, _ = await neo4j_connection.execute_query("SHOW INDEXES")
138
+ indexes = [dict(record) for record in index_records]
139
+ except Exception:
140
+ indexes = []
141
+
142
+ return {
143
+ "status": "success",
144
+ "schema": {
145
+ "node_labels": labels,
146
+ "relationship_types": relationships,
147
+ "property_keys": properties,
148
+ "constraints": constraints,
149
+ "indexes": indexes
150
+ }
151
+ }
152
+
153
+ except Exception as e:
154
+ return {
155
+ "status": "error",
156
+ "error": str(e)
157
+ }
158
+
159
+
160
+ @app.tool()
161
+ async def get_node_count(label: Optional[str] = None) -> Dict[str, Any]:
162
+ try:
163
+ if not neo4j_connection.driver:
164
+ await neo4j_connection.connect()
165
+ if not neo4j_connection.driver:
166
+ return {
167
+ "status": "error",
168
+ "error": "Unable to connect to Neo4j database"
169
+ }
170
+
171
+ if label:
172
+ query = f"MATCH (n:`{label}`) RETURN count(n) as count"
173
+ else:
174
+ query = "MATCH (n) RETURN count(n) as count"
175
+
176
+ records, _ = await neo4j_connection.execute_query(query)
177
+ count = records[0]["count"] if records else 0
178
+
179
+ return {
180
+ "status": "success",
181
+ "label": label,
182
+ "count": count
183
+ }
184
+
185
+ except Exception as e:
186
+ return {
187
+ "status": "error",
188
+ "error": str(e)
189
+ }
190
+
191
+
192
+ @app.tool()
193
+ async def get_relationship_count(relationship_type: Optional[str] = None) -> Dict[str, Any]:
194
+ try:
195
+ if not neo4j_connection.driver:
196
+ await neo4j_connection.connect()
197
+ if not neo4j_connection.driver:
198
+ return {
199
+ "status": "error",
200
+ "error": "Unable to connect to Neo4j database"
201
+ }
202
+
203
+ if relationship_type:
204
+ query = f"MATCH ()-[r:`{relationship_type}`]-() RETURN count(r) as count"
205
+ else:
206
+ query = "MATCH ()-[r]-() RETURN count(r) as count"
207
+
208
+ records, _ = await neo4j_connection.execute_query(query)
209
+ count = records[0]["count"] if records else 0
210
+
211
+ return {
212
+ "status": "success",
213
+ "relationship_type": relationship_type,
214
+ "count": count
215
+ }
216
+
217
+ except Exception as e:
218
+ return {
219
+ "status": "error",
220
+ "error": str(e)
221
+ }
222
+
223
+
224
+ @app.tool()
225
+ async def health_check() -> Dict[str, Any]:
226
+ try:
227
+ if not neo4j_connection.driver:
228
+ await neo4j_connection.connect()
229
+
230
+ if not neo4j_connection.driver:
231
+ return {
232
+ "status": "unhealthy",
233
+ "reason": "Unable to connect to Neo4j database",
234
+ "configuration": {
235
+ "uri": neo4j_connection.uri,
236
+ "database": neo4j_connection.database,
237
+ "username": neo4j_connection.username
238
+ }
239
+ }
240
+
241
+ # A simple query to test connectivity
242
+ records, _ = await neo4j_connection.execute_query("RETURN 1 as test")
243
+
244
+ if records and records[0]["test"] == 1:
245
+ return {
246
+ "status": "healthy",
247
+ "database": neo4j_connection.database,
248
+ "uri": neo4j_connection.uri,
249
+ "ssl_enabled": neo4j_connection.uri.startswith(('neo4j+s://', 'bolt+s://')),
250
+ "message": "Neo4j connection is working properly"
251
+ }
252
+ else:
253
+ return {
254
+ "status": "unhealthy",
255
+ "reason": "Query execution failed or returned unexpected results"
256
+ }
257
+
258
+ except Exception as e:
259
+ return {
260
+ "status": "unhealthy",
261
+ "reason": str(e)
262
+ }
263
+
264
+
265
+ async def clear_database_impl() -> Dict[str, Any]:
266
+ try:
267
+ if not neo4j_connection.driver:
268
+ await neo4j_connection.connect()
269
+ if not neo4j_connection.driver:
270
+ return {
271
+ "status": "error",
272
+ "error": "Unable to connect to Neo4j database"
273
+ }
274
+
275
+ node_count_query = "MATCH (n) RETURN count(n) as count"
276
+ rel_count_query = "MATCH ()-[r]->() RETURN count(r) as count"
277
+
278
+ node_records, _ = await neo4j_connection.execute_query(node_count_query)
279
+ rel_records, _ = await neo4j_connection.execute_query(rel_count_query)
280
+
281
+ nodes_before = node_records[0]["count"] if node_records else 0
282
+ rels_before = rel_records[0]["count"] if rel_records else 0
283
+
284
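+ # Delete relationships first, then nodes - Neo4j will not delete a node that still has relationships attached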
+ await neo4j_connection.execute_query("MATCH ()-[r]->() DELETE r")
285
+ await neo4j_connection.execute_query("MATCH (n) DELETE n")
286
+
287
+ print(f"✅ Cleared: {nodes_before} nodes, {rels_before} relationships")
288
+
289
+ return {
290
+ "status": "success",
291
+ "message": "Database cleared successfully",
292
+ "statistics": {
293
+ "nodes_removed": nodes_before,
294
+ "relationships_removed": rels_before
295
+ }
296
+ }
297
+
298
+ except Exception as e:
299
+ return {
300
+ "status": "error",
301
+ "error": str(e)
302
+ }
303
+
304
+
305
+ @app.tool()
306
+ async def clear_database() -> Dict[str, Any]:
307
+ return await clear_database_impl()
308
+
309
+
310
+ async def upload_graph_data_impl() -> Dict[str, Any]:
311
+ try:
312
+ if not neo4j_connection.driver:
313
+ await neo4j_connection.connect()
314
+ if not neo4j_connection.driver:
315
+ return {
316
+ "status": "error",
317
+ "error": "Unable to connect to Neo4j database"
318
+ }
319
+
320
+ clear_result = await clear_database_impl()
321
+ if clear_result["status"] != "success":
322
+ return clear_result
323
+
324
+ # Check if graph data file exists
325
+ if not os.path.exists(GRAPH_DATA_FILE):
326
+ return {
327
+ "status": "error",
328
+ "error": f"Graph data file not found: {GRAPH_DATA_FILE}"
329
+ }
330
+
331
+ with open(GRAPH_DATA_FILE, 'r', encoding='utf-8') as f:
332
+ graph_data = json.load(f)
333
+
334
+ if not isinstance(graph_data, dict) or 'nodes' not in graph_data:
335
+ return {
336
+ "status": "error",
337
+ "error": "Invalid graph data format. Expected JSON with 'nodes' array"
338
+ }
339
+
340
+ nodes = graph_data.get('nodes', [])
341
+ relationships = graph_data.get('relationships', [])
342
+ communities_data = graph_data.get('communities', {})
343
+ drift_metadata = graph_data.get('drift_search_metadata', {})
344
+ global_metadata = graph_data.get('metadata', {})
345
+ search_optimization = drift_metadata.get('search_optimization', {}) if drift_metadata else {}
346
+
347
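+ # Tally each payload type (nodes, relationships, community index entries, and singleton metadata objects) so the progress summary below reflects the full upload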
+ communities_count = len(drift_metadata.get('community_search_index', {})) if drift_metadata else 0
348
+ drift_count = 1 if drift_metadata else 0
349
+ metadata_count = 1 if global_metadata else 0
350
+ optimization_count = 1 if search_optimization else 0
351
+ communities_metadata_count = 1 if communities_data else 0
352
+ drift_config_count = 1 if (drift_metadata and 'configuration' in drift_metadata) else 0
353
+ community_search_index_count = 1 if (drift_metadata and 'community_search_index' in drift_metadata) else 0
354
+ search_optimization_object_count = 1 if (drift_metadata and 'search_optimization' in drift_metadata) else 0
355
+ embeddings_object_count = 1 if (drift_metadata and 'community_search_index' in drift_metadata) else 0
356
+ embeddings_count = communities_count if (drift_metadata and 'community_search_index' in drift_metadata) else 0
357
+
358
+ total_items = (len(nodes) + len(relationships) + communities_count + drift_count +
359
+ metadata_count + optimization_count + communities_metadata_count +
360
+ drift_config_count + community_search_index_count +
361
+ search_optimization_object_count + embeddings_object_count + embeddings_count)
362
+
363
+ print(f"Processing: {len(nodes)} nodes, {len(relationships)} relationships, {communities_count} communities, {total_items - len(nodes) - len(relationships) - communities_count} metadata")
364
+
365
+ upload_stats = {
366
+ "nodes_processed": 0,
367
+ "nodes_created": 0,
368
+ "relationships_processed": 0,
369
+ "relationships_created": 0,
370
+ "communities_processed": 0,
371
+ "communities_created": 0,
372
+ "drift_metadata_created": 0,
373
+ "global_metadata_created": 0,
374
+ "search_optimization_created": 0,
375
+ "communities_metadata_created": 0,
376
+ "drift_config_created": 0,
377
+ "community_search_index_created": 0,
378
+ "search_optimization_object_created": 0,
379
+ "embeddings_object_created": 0,
380
+ "embeddings_created": 0,
381
+ "errors": []
382
+ }
383
+
384
+ with tqdm(total=len(nodes), desc="Nodes", unit="node", ncols=80, leave=False) as pbar:
385
+ for node in nodes:
386
+ try:
387
+ upload_stats["nodes_processed"] += 1
388
+
389
+ node_id = node['id']
390
+ labels = node['labels']
391
+ properties = node.get('properties', {})
392
+
393
+ # Create node with labels
394
+ labels_str = ':'.join([f"`{label}`" for label in labels])
395
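+ # MERGE on id keeps re-runs idempotent; SET += merges properties without overwriting the id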
+ query = f"MERGE (n:{labels_str} {{id: $id}}) SET n += $props RETURN n"
396
+
397
+ await neo4j_connection.execute_query(query, {
398
+ "id": node_id,
399
+ "props": properties
400
+ })
401
+
402
+ upload_stats["nodes_created"] += 1
403
+ pbar.update(1)
404
+
405
+ except Exception as e:
406
+ upload_stats["errors"].append(f"Node upload error: {str(e)}")
407
+ pbar.update(1)
408
+
409
+ with tqdm(total=len(relationships), desc="Relationships", unit="rel", ncols=80, leave=False) as pbar:
410
+ for rel in relationships:
411
+ upload_stats["relationships_processed"] += 1
412
+
413
+ start_node = rel['startNode']
414
+ end_node = rel['endNode']
415
+ rel_type = rel['type']
+ properties = rel.get('properties', {})
416
+
417
+ try:
418
+ query = f"""
419
+ MATCH (a {{id: $start_node}})
420
+ MATCH (b {{id: $end_node}})
421
+ CREATE (a)-[r:`{rel_type}`]->(b)
422
+ SET r += $props
423
+ RETURN r
424
+ """
425
+
426
+ await neo4j_connection.execute_query(query, {
427
+ "start_node": start_node,
428
+ "end_node": end_node,
429
+ "props": properties
430
+ })
431
+
432
+ upload_stats["relationships_created"] += 1
433
+ pbar.update(1)
434
+
435
+ except Exception as e:
436
+ error_msg = f"Relationship upload error for rel {rel}: {str(e)}"
437
+ logger.error(error_msg)
438
+ upload_stats["errors"].append(error_msg)
439
+ pbar.update(1)
440
+
441
+ if drift_metadata and 'community_search_index' in drift_metadata:
442
+ community_index = drift_metadata['community_search_index']
443
+
444
+ with tqdm(total=len(community_index), desc="Communities", unit="comm", ncols=80, leave=False) as pbar:
445
+ for comm_id, comm_data in community_index.items():
446
+ try:
447
+ upload_stats["communities_processed"] += 1
448
+
449
+ embeddings = comm_data.get('embeddings', {})
450
+ summary_embedding = embeddings.get('summary_embedding', [])
451
+ hyde_embeddings = embeddings.get('hyde_embeddings', [])
452
+
453
+ follow_up_templates_json = json.dumps(comm_data.get('follow_up_templates', {}))
454
+ hyde_embeddings_json = json.dumps(hyde_embeddings)
455
+
456
+ # Get statistics
457
+ stats = comm_data.get('statistics', {})
458
+
459
+ # Community properties with documented attributes from JSON
460
+ community_props = {
461
+ "id": comm_id,
462
+ "summary": comm_data.get('summary', ''),
463
+ "key_entities": comm_data.get('key_entities', []),
464
+ "member_count": stats.get('member_count', 0),
465
+ "member_ids": stats.get('member_ids', []),
466
+ "internal_edges": stats.get('internal_edges', 0),
467
+ "density": stats.get('density', 0.0),
468
+ "avg_degree": stats.get('avg_degree', 0.0),
469
+ "follow_up_templates": follow_up_templates_json,
470
+ "hyde_embeddings": hyde_embeddings_json
471
+ }
472
+
473
+ # Add summary embedding as List<Float> if available
474
+ if summary_embedding and isinstance(summary_embedding, list):
475
+ community_props["summary_embedding"] = summary_embedding
476
+ community_props["embedding_dimensions"] = len(summary_embedding)
477
+
478
+ # Create Community node
479
+ query = """
480
+ MERGE (c:Community {id: $id})
481
+ SET c += $props
482
+ RETURN c
483
+ """
484
+
485
+ await neo4j_connection.execute_query(query, {
486
+ "id": comm_id,
487
+ "props": community_props
488
+ })
489
+
490
+ upload_stats["communities_created"] += 1
491
+ pbar.update(1)
492
+
493
+ except Exception as e:
494
+ error_msg = f"Community upload error for {comm_id}: {str(e)}"
495
+ logger.error(error_msg)
496
+ upload_stats["errors"].append(error_msg)
497
+ pbar.update(1)
498
+
499
+ if drift_metadata:
500
+ try:
501
+ query_routing_config_json = json.dumps(drift_metadata.get('query_routing_config', {}))
502
+ performance_monitoring_json = json.dumps(drift_metadata.get('performance_monitoring', {}))
503
+ configuration_json = json.dumps(drift_metadata.get('configuration', {}))
504
+ community_search_index_json = json.dumps(drift_metadata.get('community_search_index', {}))
505
+ search_optimization_json = json.dumps(drift_metadata.get('search_optimization', {}))
506
+
507
+ # Build a compact embeddings object (per-community) to store on the DRIFT node
508
+ embeddings_per_community = {}
509
+ for _comm_id, _comm_data in drift_metadata.get('community_search_index', {}).items():
510
+ emb = _comm_data.get('embeddings')
511
+ if emb:
512
+ # Only keep summary and hyde to limit size
513
+ embeddings_per_community[_comm_id] = {
514
+ 'summary_embedding': emb.get('summary_embedding'),
515
+ 'hyde_embeddings': emb.get('hyde_embeddings')
516
+ }
517
+
518
+ embeddings_json = json.dumps(embeddings_per_community)
519
+
520
+ drift_props = {
521
+ "version": drift_metadata.get('version', '1.0'),
522
+ "generated_timestamp": drift_metadata.get('generated_timestamp', ''),
523
+ "query_routing_config": query_routing_config_json,
524
+ "performance_monitoring": performance_monitoring_json,
525
+ "configuration": configuration_json,
526
+ # Nested objects stored as JSON strings for direct inspection
527
+ "community_search_index": community_search_index_json,
528
+ "search_optimization": search_optimization_json,
529
+ "embeddings": embeddings_json,
530
+ "total_communities": len(drift_metadata.get('community_search_index', {}))
531
+ }
532
+
533
+ # Create single DRIFT metadata node
534
+ query = """
535
+ MERGE (d:DriftMetadata {version: $version})
536
+ SET d += $props
537
+ RETURN d
538
+ """
539
+
540
+ await neo4j_connection.execute_query(query, {
541
+ "version": drift_metadata.get('version', '1.0'),
542
+ "props": drift_props
543
+ })
544
+
545
+ upload_stats["drift_metadata_created"] = 1
546
+
547
+ except Exception as e:
548
+ error_msg = f"DRIFT metadata upload error: {str(e)}"
549
+ logger.error(error_msg)
550
+ upload_stats["errors"].append(error_msg)
551
+
552
+ if global_metadata:
553
+ try:
554
+ # Convert nested objects to JSON strings for Neo4j compatibility
555
+ recovery_stats_json = json.dumps(global_metadata.get('recovery_stats', {}))
556
+ member_extraction_stats_json = json.dumps(global_metadata.get('member_extraction_stats', {}))
557
+ community_detection_json = json.dumps(global_metadata.get('community_detection', {}))
558
+
559
+ metadata_props = {
560
+ "node_count": global_metadata.get('node_count', 0),
561
+ "relationship_count": global_metadata.get('relationship_count', 0),
562
+ "generated_at": global_metadata.get('generated_at', ''),
563
+ "generator": global_metadata.get('generator', ''),
564
+ "llm_provider": global_metadata.get('llm_provider', ''),
565
+ "model": global_metadata.get('model', ''),
566
+ "format_version": global_metadata.get('format_version', ''),
567
+ "last_updated": global_metadata.get('last_updated', ''),
568
+ "phase": global_metadata.get('phase', ''),
569
+ "entity_count": global_metadata.get('entity_count', global_metadata.get('node_count', 0)),
570
+ "community_count": global_metadata.get('community_count', 0),
571
+ "total_node_count": global_metadata.get('total_node_count', global_metadata.get('node_count', 0)),
572
+ "total_relationship_count": global_metadata.get('total_relationship_count', global_metadata.get('relationship_count', 0)),
573
+
574
+ # Complex nested objects as JSON strings
575
+ "recovery_stats": recovery_stats_json,
576
+ "member_extraction_stats": member_extraction_stats_json,
577
+ "community_detection": community_detection_json
578
+ }
579
+
580
+ # Create Global Metadata node
581
+ query = """
582
+ MERGE (m:GraphMetadata {generator: $generator})
583
+ SET m += $props
584
+ RETURN m
585
+ """
586
+
587
+ await neo4j_connection.execute_query(query, {
588
+ "generator": global_metadata.get('generator', 'unknown'),
589
+ "props": metadata_props
590
+ })
591
+
592
+ upload_stats["global_metadata_created"] = 1
593
+
594
+ except Exception as e:
595
+ error_msg = f"Global metadata upload error: {str(e)}"
596
+ logger.error(error_msg)
597
+ upload_stats["errors"].append(error_msg)
598
+
599
+ if search_optimization:
600
+ try:
601
+ optimization_props = {
602
+ "total_communities": search_optimization.get('total_communities', 0),
603
+ "avg_community_size": search_optimization.get('avg_community_size', 0.0),
604
+ "graph_density": search_optimization.get('graph_density', 0.0),
605
+ "total_nodes": search_optimization.get('total_nodes', 0),
606
+ "total_edges": search_optimization.get('total_edges', 0),
607
+ "max_primer_communities": search_optimization.get('max_primer_communities', 0)
608
+ }
609
+
610
+ query = """
611
+ MERGE (s:SearchOptimization {id: 'global'})
612
+ SET s += $props
613
+ RETURN s
614
+ """
615
+
616
+ await neo4j_connection.execute_query(query, {
617
+ "props": optimization_props
618
+ })
619
+
620
+ upload_stats["search_optimization_created"] = 1
621
+
622
+ except Exception as e:
623
+ error_msg = f"Search optimization upload error: {str(e)}"
624
+ logger.error(error_msg)
625
+ upload_stats["errors"].append(error_msg)
626
+
627
+ if communities_data:
628
+ try:
629
+ communities_props = {
630
+ "algorithm": communities_data.get('algorithm', ''),
631
+ "total_communities": communities_data.get('total_communities', 0),
632
+ "modularity_score": communities_data.get('modularity_score', 0.0),
633
+ "summaries": json.dumps(communities_data.get('summaries', {})),
634
+ "statistics": json.dumps(communities_data.get('statistics', {}))
635
+ }
636
+
637
+ query = """
638
+ MERGE (cm:CommunitiesMetadata {algorithm: $algorithm})
639
+ SET cm += $props
640
+ RETURN cm
641
+ """
642
+
643
+ await neo4j_connection.execute_query(query, {
644
+ "algorithm": communities_data.get('algorithm', 'unknown'),
645
+ "props": communities_props
646
+ })
647
+
648
+ upload_stats["communities_metadata_created"] = 1
649
+
650
+ except Exception as e:
651
+ error_msg = f"Communities metadata upload error: {str(e)}"
652
+ logger.error(error_msg)
653
+ upload_stats["errors"].append(error_msg)
654
+
655
+ if drift_metadata and 'configuration' in drift_metadata:
656
+ try:
657
+ config = drift_metadata['configuration']
658
+
659
+ drift_config_props = {
660
+ "max_iterations": config.get('max_iterations', 0),
661
+ "confidence_threshold": config.get('confidence_threshold', 0.0),
662
+ "top_k_communities": config.get('top_k_communities', 0),
663
+ "hyde_expansion_count": config.get('hyde_expansion_count', 0),
664
+ "termination_criteria": config.get('termination_criteria', ''),
665
+ "version": drift_metadata.get('version', '1.0'),
666
+ "generated_timestamp": drift_metadata.get('generated_timestamp', '')
667
+ }
668
+
669
+ query = """
670
+ MERGE (dc:DriftConfiguration {version: $version})
671
+ SET dc += $props
672
+ RETURN dc
673
+ """
674
+
675
+ await neo4j_connection.execute_query(query, {
676
+ "version": drift_metadata.get('version', '1.0'),
677
+ "props": drift_config_props
678
+ })
679
+
680
+ upload_stats["drift_config_created"] = 1
681
+
682
+ except Exception as e:
683
+ error_msg = f"DRIFT Configuration upload error: {str(e)}"
684
+ logger.error(error_msg)
685
+ upload_stats["errors"].append(error_msg)
686
+
687
+ if drift_metadata and 'community_search_index' in drift_metadata:
688
+ try:
689
+ community_search_index = drift_metadata['community_search_index']
690
+
691
+ search_index_props = {
692
+ "version": drift_metadata.get('version', '1.0'),
693
+ "total_communities": len(community_search_index),
694
+ "community_data": json.dumps(community_search_index),
695
+ "generated_timestamp": drift_metadata.get('generated_timestamp', ''),
696
+ "index_type": "community_search"
697
+ }
698
+
699
+ query = """
700
+ MERGE (csi:CommunitySearchIndex {version: $version})
701
+ SET csi += $props
702
+ RETURN csi
703
+ """
704
+
705
+ await neo4j_connection.execute_query(query, {
706
+ "version": drift_metadata.get('version', '1.0'),
707
+ "props": search_index_props
708
+ })
709
+
710
+ upload_stats["community_search_index_created"] = 1
711
+
712
+ except Exception as e:
713
+ error_msg = f"Community Search Index upload error: {str(e)}"
714
+ logger.error(error_msg)
715
+ upload_stats["errors"].append(error_msg)
716
+
717
+ if drift_metadata and 'search_optimization' in drift_metadata:
718
+ try:
719
+ search_opt_data = drift_metadata['search_optimization']
720
+
721
+ search_opt_props = {
722
+ "total_communities": search_opt_data.get('total_communities', 0),
723
+ "avg_community_size": search_opt_data.get('avg_community_size', 0.0),
724
+ "graph_density": search_opt_data.get('graph_density', 0.0),
725
+ "total_nodes": search_opt_data.get('total_nodes', 0),
726
+ "total_edges": search_opt_data.get('total_edges', 0),
727
+ "max_primer_communities": search_opt_data.get('max_primer_communities', 0),
728
+ "optimization_version": drift_metadata.get('version', '1.0')
729
+ }
730
+
731
+ query = """
732
+ MERGE (so:SearchOptimizationObject {optimization_version: $version})
733
+ SET so += $props
734
+ RETURN so
735
+ """
736
+
737
+ await neo4j_connection.execute_query(query, {
738
+ "version": drift_metadata.get('version', '1.0'),
739
+ "props": search_opt_props
740
+ })
741
+
742
+ upload_stats["search_optimization_object_created"] = 1
743
+
744
+ except Exception as e:
745
+ error_msg = f"Search Optimization object upload error: {str(e)}"
746
+ logger.error(error_msg)
747
+ upload_stats["errors"].append(error_msg)
748
+
749
+ if drift_metadata and 'community_search_index' in drift_metadata:
750
+ try:
751
+ community_index = drift_metadata['community_search_index']
752
+
753
+ total_embeddings = 0
754
+ total_dimensions = 0
755
+ embedding_communities = []
756
+
757
+ for comm_id, comm_data in community_index.items():
758
+ embeddings_data = comm_data.get('embeddings', {})
759
+ if embeddings_data:
760
+ total_embeddings += 1
761
+ if embeddings_data.get('summary_embedding'):
762
+ total_dimensions = len(embeddings_data.get('summary_embedding', []))
763
+ embedding_communities.append(comm_id)
764
+
765
+ # Create embeddings object properties
766
+ embeddings_obj_props = {
767
+ "total_embeddings": total_embeddings,
768
+ "embedding_dimensions": total_dimensions,
769
+ "embedding_computation": "computed via text-embedding-ada-002",
770
+ "communities_with_embeddings": embedding_communities,
771
+ "embedding_type": "community_summaries",
772
+ "embeddings_version": drift_metadata.get('version', '1.0')
773
+ }
774
+
775
+ query = """
776
+ MERGE (eo:EmbeddingsObject {embeddings_version: $version})
777
+ SET eo += $props
778
+ RETURN eo
779
+ """
780
+
781
+ await neo4j_connection.execute_query(query, {
782
+ "version": drift_metadata.get('version', '1.0'),
783
+ "props": embeddings_obj_props
784
+ })
785
+
786
+ upload_stats["embeddings_object_created"] = 1
787
+
788
+ except Exception as e:
789
+ error_msg = f"Embeddings object upload error: {str(e)}"
790
+ logger.error(error_msg)
791
+ upload_stats["errors"].append(error_msg)
792
+
793
+ if drift_metadata and 'community_search_index' in drift_metadata:
794
+ community_index = drift_metadata['community_search_index']
795
+
796
+ for comm_id, comm_data in community_index.items():
797
+ try:
798
+ embeddings_data = comm_data.get('embeddings', {})
799
+ if embeddings_data:
800
+ embeddings_props = {
801
+ "community_id": comm_id,
802
+ "summary_embedding": embeddings_data.get('summary_embedding', []),
803
+ "hyde_embeddings": json.dumps(embeddings_data.get('hyde_embeddings', [])),
804
+ "embedding_dimensions": len(embeddings_data.get('summary_embedding', [])),
805
+ "embedding_computation": embeddings_data.get('embedding_computation', 'computed')
806
+ }
807
+
808
+ query = """
809
+ MERGE (e:Embeddings {community_id: $community_id})
810
+ SET e += $props
811
+ RETURN e
812
+ """
813
+
814
+ await neo4j_connection.execute_query(query, {
815
+ "community_id": comm_id,
816
+ "props": embeddings_props
817
+ })
818
+
819
+ upload_stats["embeddings_created"] += 1
820
+
821
+ except Exception as e:
822
+ error_msg = f"Embeddings upload error for {comm_id}: {str(e)}"
823
+ logger.error(error_msg)
824
+ upload_stats["errors"].append(error_msg)
825
+
826
+ if communities_count > 0:
827
+ try:
828
+ # Connect entities to their communities based on community_id property
829
+ community_rel_query = """
830
+ MATCH (n) WHERE n.community_id IS NOT NULL
831
+ MATCH (c:Community {id: n.community_id})
832
+ MERGE (n)-[:BELONGS_TO_COMMUNITY]->(c)
833
+ """
834
+ await neo4j_connection.execute_query(community_rel_query, {})
835
+
836
+ except Exception as e:
837
+ error_msg = f"Community relationship creation error: {str(e)}"
838
+ logger.error(error_msg)
839
+ upload_stats["errors"].append(error_msg)
840
+
841
+ # Calculate success percentage for all components
842
+ nodes_success_rate = (upload_stats["nodes_created"] / len(nodes) * 100) if nodes else 100
843
+ rels_success_rate = (upload_stats["relationships_created"] / len(relationships) * 100) if relationships else 100
844
+ communities_success_rate = (upload_stats["communities_created"] / communities_count * 100) if communities_count else 100
845
+ drift_success_rate = (upload_stats["drift_metadata_created"] / drift_count * 100) if drift_count else 100
846
+
847
+ embedding_dimensions = 0
848
+ if drift_metadata and 'community_search_index' in drift_metadata:
849
+ for comm_data in drift_metadata['community_search_index'].values():
850
+ embeddings = comm_data.get('embeddings', {})
851
+ summary_embedding = embeddings.get('summary_embedding', [])
852
+ if summary_embedding:
853
+ embedding_dimensions = len(summary_embedding)
854
+ break
855
+
856
+ total_created = (upload_stats["nodes_created"] + upload_stats["relationships_created"] +
857
+ upload_stats["communities_created"] + upload_stats["drift_metadata_created"] +
858
+ upload_stats["global_metadata_created"] + upload_stats["search_optimization_created"] +
859
+ upload_stats["communities_metadata_created"] + upload_stats["drift_config_created"] +
860
+ upload_stats["community_search_index_created"] + upload_stats["search_optimization_object_created"] +
861
+ upload_stats["embeddings_object_created"] + upload_stats["embeddings_created"])
862
+ overall_success_rate = (total_created / total_items * 100) if total_items else 100
863
+
864
+ result = {
865
+ "status": "success",
866
+ "message": "Graph data upload completed successfully",
867
+ "statistics": upload_stats,
868
+ "success_rates": {
869
+ "nodes": f"{nodes_success_rate:.1f}%",
870
+ "relationships": f"{rels_success_rate:.1f}%",
871
+ "communities": f"{communities_success_rate:.1f}%",
872
+ "drift_metadata": f"{drift_success_rate:.1f}%",
873
+ "global_metadata": f"{100.0 if upload_stats['global_metadata_created'] > 0 else 0:.1f}%",
874
+ "search_optimization": f"{100.0 if upload_stats['search_optimization_created'] > 0 else 0:.1f}%",
875
+ "communities_metadata": f"{100.0 if upload_stats['communities_metadata_created'] > 0 else 0:.1f}%",
876
+ "drift_config": f"{100.0 if upload_stats['drift_config_created'] > 0 else 0:.1f}%",
877
+ "community_search_index": f"{100.0 if upload_stats['community_search_index_created'] > 0 else 0:.1f}%",
878
+ "search_optimization_object": f"{100.0 if upload_stats['search_optimization_object_created'] > 0 else 0:.1f}%",
879
+ "embeddings_object": f"{100.0 if upload_stats['embeddings_object_created'] > 0 else 0:.1f}%",
880
+ "embeddings": f"{(upload_stats['embeddings_created']/communities_count*100) if communities_count > 0 else 0:.1f}%",
881
+ "overall": f"{overall_success_rate:.1f}%"
882
+ },
883
+ "source_file": GRAPH_DATA_FILE,
884
+ "architecture_summary": {
885
+ "nodes": f"{upload_stats['nodes_created']}/{len(nodes)}",
886
+ "relationships": f"{upload_stats['relationships_created']}/{len(relationships)}",
887
+ "communities": f"{upload_stats['communities_created']}/{communities_count}",
888
+ "drift_metadata": f"{upload_stats['drift_metadata_created']}/{drift_count}",
889
+ "global_metadata": f"{upload_stats['global_metadata_created']}/1",
890
+ "search_optimization": f"{upload_stats['search_optimization_created']}/1",
891
+ "embeddings_stored": communities_count > 0,
892
+ "vector_dimensions": embedding_dimensions,
893
+ "complete_metadata_coverage": upload_stats['global_metadata_created'] > 0 and upload_stats['search_optimization_created'] > 0
894
+ }
895
+ }
896
+
897
+ # Print concise upload summary
898
+ print(f"\n✅ Upload completed: {total_created}/{total_items} items ({overall_success_rate:.1f}%)")
899
+
900
+ # Show all node types created
901
+ total_entity_nodes = upload_stats['nodes_created']
902
+ total_metadata_nodes = (upload_stats['drift_metadata_created'] +
903
+ upload_stats['global_metadata_created'] +
904
+ upload_stats['search_optimization_created'] +
905
+ upload_stats['communities_metadata_created'] +
906
+ upload_stats['drift_config_created'] +
907
+ upload_stats['community_search_index_created'] +
908
+ upload_stats['search_optimization_object_created'] +
909
+ upload_stats['embeddings_object_created'])
910
+
911
+ print(f" Entity Nodes: {total_entity_nodes}, Community Nodes: {upload_stats['communities_created']}, Metadata Nodes: {total_metadata_nodes}, Embedding Nodes: {upload_stats['embeddings_created']}")
912
+ print(f" Relationships: {upload_stats['relationships_created']}")
913
+
914
+ if upload_stats['errors']:
915
+ print(f" ⚠️ {len(upload_stats['errors'])} errors encountered")
916
+
917
+ return result
918
+
919
+ except Exception as e:
920
+ logger.error(f"Graph data upload failed: {str(e)}")
921
+ return {
922
+ "status": "error",
923
+ "error": str(e)
924
+ }
925
+
926
+
927
+ @app.tool()
928
+ async def upload_graph_data() -> Dict[str, Any]:
929
+ return await upload_graph_data_impl()
930
+
931
+
932
+ @app.tool()
933
+ async def check_graph_data_file() -> Dict[str, Any]:
934
+ try:
935
+ if not os.path.exists(GRAPH_DATA_FILE):
936
+ return {
937
+ "status": "not_found",
938
+ "path": GRAPH_DATA_FILE,
939
+ "message": "Graph data file does not exist"
940
+ }
941
+
942
+ # Get file stats
943
+ file_stats = os.stat(GRAPH_DATA_FILE)
944
+ file_size = file_stats.st_size
945
+
946
+ # Try to parse the JSON to validate format
947
+ try:
948
+ with open(GRAPH_DATA_FILE, 'r', encoding='utf-8') as f:
949
+ graph_data = json.load(f)
950
+
951
+ nodes_count = len(graph_data.get('nodes', []))
952
+ relationships_count = len(graph_data.get('relationships', []))
953
+
954
+ return {
955
+ "status": "found",
956
+ "path": GRAPH_DATA_FILE,
957
+ "file_size_bytes": file_size,
958
+ "nodes_count": nodes_count,
959
+ "relationships_count": relationships_count,
960
+ "valid_json": True
961
+ }
962
+
963
+ except json.JSONDecodeError as e:
964
+ return {
965
+ "status": "invalid",
966
+ "path": GRAPH_DATA_FILE,
967
+ "file_size_bytes": file_size,
968
+ "valid_json": False,
969
+ "json_error": str(e)
970
+ }
971
+
972
+ except Exception as e:
973
+ return {
974
+ "status": "error",
975
+ "error": str(e),
976
+ "error_type": type(e).__name__
977
+ }
978
+
979
+
980
+ @app.tool()
981
+ async def get_connection_info() -> Dict[str, Any]:
982
+ try:
983
+ # Always return configuration info even if not connected
984
+ deployment_type = "Self-hosted"
985
+ if "databases.neo4j.io" in neo4j_connection.uri:
986
+ deployment_type = "Neo4j Aura"
987
+ elif "sandbox" in neo4j_connection.uri:
988
+ deployment_type = "Neo4j Sandbox"
989
+ elif any(cloud in neo4j_connection.uri for cloud in ["aws", "gcp", "azure"]):
990
+ deployment_type = "Enterprise Cloud"
991
+
992
+ connection_info = {
993
+ "status": "success",
994
+ "connection": {
995
+ "uri": neo4j_connection.uri,
996
+ "database": neo4j_connection.database,
997
+ "username": neo4j_connection.username,
998
+ "deployment_type": deployment_type,
999
+ "ssl_enabled": neo4j_connection.uri.startswith(('neo4j+s://', 'bolt+s://')),
1000
+ "connected": neo4j_connection.driver is not None
1001
+ },
1002
+ "capabilities": {
1003
+ "cypher_queries": True,
1004
+ "schema_inspection": True,
1005
+ "bulk_operations": True,
1006
+ "graph_algorithms": "unknown",
1007
+ "multi_database": "unknown"
1008
+ }
1009
+ }
1010
+
1011
+ if neo4j_connection.driver:
1012
+ try:
1013
+ server_info_records, _ = await neo4j_connection.execute_query(
1014
+ "CALL dbms.components() YIELD name, versions, edition"
1015
+ )
1016
+ connection_info["server_info"] = server_info_records[0] if server_info_records else {}
1017
+ except Exception:
1018
+ connection_info["server_info"] = {}
1019
+
1020
+ return connection_info
1021
+
1022
+ except Exception as e:
1023
+ logger.error(f"Connection info retrieval failed: {str(e)}")
1024
+ return {
1025
+ "status": "error",
1026
+ "error": str(e),
1027
+ "error_type": type(e).__name__
1028
+ }
1029
+
1030
+
1031
+ if __name__ == "__main__":
1032
+ import sys
1033
+ try:
1034
+ asyncio.run(neo4j_connection.connect())
1035
+ print(f"Looking for graph data at: {GRAPH_DATA_FILE}")
1036
+ print(f"File exists: {os.path.exists(GRAPH_DATA_FILE)}")
1037
+
1038
+ result = asyncio.run(upload_graph_data_impl())
1039
+ print(f"Upload result: {result.get('status', 'unknown')}")
1040
+
1041
+ if result.get('status') == 'error':
1042
+ print(f"❌ Error details: {result.get('error', 'Unknown error')}")
1043
+ if 'error_type' in result:
1044
+ print(f"Error type: {result['error_type']}")
1045
+
1046
+ except ValueError as e:
1047
+ logger.error(f"Configuration Error: {e}")
1048
+ sys.exit(1)
1049
+ except Exception as e:
1050
+ logger.warning(f"Connection Warning: {e}")
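The uploader above stores every nested object (recovery_stats, community_search_index, search_optimization, per-community embeddings) as a JSON string property, because Neo4j properties cannot hold nested maps. Anything reading those nodes back therefore has to decode the strings client-side. A minimal sketch of that read path, assuming the official neo4j Python driver and the labels and property names written by the script; the URI, credentials, and database name are placeholders:

import json
from neo4j import GraphDatabase

# Placeholders - substitute the same NEO4J_* settings the uploader uses
driver = GraphDatabase.driver(
    "neo4j+s://<your-instance>.databases.neo4j.io",
    auth=("neo4j", "<password>"),
)

with driver.session(database="neo4j") as session:
    record = session.run(
        "MATCH (m:GraphMetadata) RETURN m.community_detection AS cd LIMIT 1"
    ).single()
    if record and record["cd"]:
        # The property was written with json.dumps(), so decode it back into a dict
        community_detection = json.loads(record["cd"])
        print(community_detection)

driver.close()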
4_query copy.py ADDED
@@ -0,0 +1,173 @@
1
+ import os
2
+ from my_config import MY_CONFIG
3
+
4
+ # Route Hugging Face downloads through the endpoint configured in MY_CONFIG (useful when https://huggingface.co/ is unreachable)
5
+ os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT
6
+
7
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
8
+ from llama_index.core import Settings
9
+ from llama_index.core import VectorStoreIndex, StorageContext
10
+ from llama_index.vector_stores.milvus import MilvusVectorStore
12
+ from dotenv import load_dotenv
13
+ from llama_index.llms.litellm import LiteLLM
14
+ import query_utils
15
+ import time
16
+ import logging
17
+ import json
18
+
19
+ # Ensure the log directory exists before the FileHandler below tries to open it
+ os.makedirs('logs/query', exist_ok=True)
+
+ logging.basicConfig(
20
+ level=logging.INFO,
21
+ format='%(asctime)s - %(levelname)s - %(message)s',
22
+ handlers=[
23
+ logging.FileHandler('logs/query/query_log.txt', mode='a'), # Save to file
24
+ logging.StreamHandler() # Also show in console
25
+ ],
26
+ force=True
27
+ )
28
+ logger = logging.getLogger(__name__)
29
+ logger.setLevel(logging.INFO)
30
+
31
+
32
+ def run_query(query: str):
33
+ global query_engine
34
+ logger.info (f"-----------------------------------")
35
+ start_time = time.time()
36
+ query = query_utils.tweak_query(query, MY_CONFIG.LLM_MODEL)
37
+ logger.info (f"\nProcessing Query:\n{query}")
38
+
39
+ # Get initial vector response
40
+ vector_response = query_engine.query(query)
41
+ vector_text = str(vector_response).strip()
42
+
43
+ # Structured prompt
44
+ structured_prompt = f"""Please provide a comprehensive, well-structured answer using the provided document information.
45
+
46
+ Question: {query}
47
+
48
+ Document Information:
49
+ {vector_text}
50
+
51
+ Instructions:
52
+ 1. Provide accurate, factual information based on the documents
53
+ 2. Structure your response clearly with proper formatting
54
+ 3. Be comprehensive yet concise
55
+ 4. Highlight key relationships and important details when relevant
56
+ 5. Use bullet points or sections when appropriate for clarity
57
+
58
+ Please provide your answer:"""
59
+
60
+ # Use structured prompt for final synthesis
61
+ res = query_engine.query(structured_prompt)
62
+
63
+ end_time = time.time()
64
+ total_time = end_time - start_time
65
+ logger.info ( "-------"
66
+ + f"\nResponse:\n{res}"
67
+ + f"\n\n⏱️ Total time: {total_time:.1f} seconds"
68
+ + f"\n\nResponse Metadata:\n{json.dumps(res.metadata, indent=2)}"
69
+ # + f"\nSource Nodes: {[node.node_id for node in res.source_nodes]}"
70
+ )
71
+ logger.info (f"-----------------------------------")
72
+
73
+ # Save response and metadata to files
74
+ _save_query_files(query, res, total_time)
75
+
76
+ return res
77
+
78
+ def _save_query_files(query: str, response, total_time: float):
79
+ """Save query response and metadata to files."""
80
+ import time
81
+ timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
82
+
83
+ try:
84
+ # Save response to file
85
+ with open('logs/query/query_responses.txt', 'a', encoding='utf-8') as f:
86
+ f.write(f"\n{'='*80}\n")
87
+ f.write(f"QUERY [{timestamp}]: {query}\n")
88
+ f.write(f"{'='*80}\n")
89
+ f.write(f"RESPONSE: {response}\n")
90
+ f.write(f"TIME: {total_time:.1f} seconds\n")
91
+ f.write(f"{'='*80}\n\n")
92
+
93
+ # Save metadata to file
94
+ with open('logs/query/query_metadata.txt', 'a', encoding='utf-8') as f:
95
+ f.write(f"\n{'='*80}\n")
96
+ f.write(f"METADATA [{timestamp}]: {query}\n")
97
+ f.write(f"{'='*80}\n")
98
+ f.write(f"TIME: {total_time:.1f} seconds\n")
99
+ f.write(json.dumps(response.metadata, indent=2, default=str))
100
+ f.write(f"\n{'='*80}\n\n")
101
+
102
+ logger.info(f"Saved response and metadata for query: {query[:50]}...")
103
+ except Exception as e:
104
+ logger.error(f"Failed to save query files: {e}")
105
+
106
+ ## ======= end : run_query =======
107
+
108
+ ## load env config
109
+ load_dotenv()
110
+
111
+ # Setup embeddings
112
+ Settings.embed_model = HuggingFaceEmbedding(
113
+ model_name = MY_CONFIG.EMBEDDING_MODEL
114
+ )
115
+ logger.info (f"✅ Using embedding model: {MY_CONFIG.EMBEDDING_MODEL}")
116
+
117
+ # Connect to Vector RAG only database
118
+ vector_store = MilvusVectorStore(
119
+ uri = MY_CONFIG.MILVUS_URI_VECTOR, # Use dedicated Vector-only database
120
+ dim = MY_CONFIG.EMBEDDING_LENGTH,
121
+ collection_name = MY_CONFIG.COLLECTION_NAME,
122
+ overwrite=False # so we load the index from db
123
+ )
124
+ storage_context = StorageContext.from_defaults(vector_store=vector_store)
125
+ logger.info (f"✅ Connected to Vector-only Milvus instance: {MY_CONFIG.MILVUS_URI_VECTOR}")
126
+
127
+ # Load Document Index from DB
128
+
129
+ index = VectorStoreIndex.from_vector_store(
130
+ vector_store=vector_store, storage_context=storage_context)
131
+ logger.info (f"✅ Loaded Vector-only index from: {MY_CONFIG.MILVUS_URI_VECTOR}")
132
+
133
+ # Setup LLM
134
+ logger.info (f"✅ Using LLM model : {MY_CONFIG.LLM_MODEL}")
135
+ Settings.llm = LiteLLM (
136
+ model=MY_CONFIG.LLM_MODEL,
137
+ )
138
+
139
+ query_engine = index.as_query_engine()
140
+
141
+ # Sample queries
142
+ queries = [
143
+ # "What is AI Alliance?",
144
+ # "What are the main focus areas of AI Alliance?",
145
+ # "What are some ai alliance projects?",
146
+ # "What are the upcoming events?",
147
+ # "How do I join the AI Alliance?",
148
+ # "When was the moon landing?",
149
+ ]
150
+
151
+ for query in queries:
152
+ run_query(query)
153
+
154
+ logger.info (f"-----------------------------------")
155
+
156
+ while True:
157
+ # Get user input
158
+ user_query = input("\nEnter your question (or 'q' to exit): ")
159
+
160
+ # Check if user wants to quit
161
+ if user_query.lower() in ['quit', 'exit', 'q']:
162
+ logger.info ("Goodbye!")
163
+ break
164
+
165
+ # Process the query
166
+ if user_query.strip() == "":
167
+ continue
168
+
169
+ try:
170
+ run_query(user_query)
171
+ except Exception as e:
172
+ logger.error(f"Error processing query: {e}")
173
+ print(f"Error processing query: {e}")
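run_query() above makes two passes through the query engine: the first retrieves document context, and the second feeds a structured prompt that already contains that context back through retrieval. A possible variation, shown only as a sketch, is to send the structured prompt straight to the configured LLM for the final synthesis; this assumes Settings.llm has already been set up with LiteLLM as in the script:

from llama_index.core import Settings

def synthesize_final_answer(structured_prompt: str) -> str:
    # Call the configured LLM directly instead of running a second retrieval pass
    response = Settings.llm.complete(structured_prompt)
    return response.text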
4_query.ipynb ADDED
@@ -0,0 +1,398 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# RAG on HTML documents\n"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "## Step-1: Configuration"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 1,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "from my_config import MY_CONFIG"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "markdown",
28
+ "metadata": {},
29
+ "source": [
30
+ "## Step-2: Setup Embeddings"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 2,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "# Route Hugging Face downloads through a mirror endpoint (useful when https://huggingface.co/ is unreachable)\n",
40
+ "import os\n",
41
+ "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 3,
47
+ "metadata": {},
48
+ "outputs": [
49
+ {
50
+ "name": "stderr",
51
+ "output_type": "stream",
52
+ "text": [
53
+ "/home/sujee/my-stuff/projects/ai-alliance/allycat-1/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
54
+ " from .autonotebook import tqdm as notebook_tqdm\n"
55
+ ]
56
+ }
57
+ ],
58
+ "source": [
59
+ "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
60
+ "from llama_index.core import Settings\n",
61
+ "\n",
62
+ "Settings.embed_model = HuggingFaceEmbedding(\n",
63
+ " model_name = MY_CONFIG.EMBEDDING_MODEL\n",
64
+ ")"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "markdown",
69
+ "metadata": {},
70
+ "source": [
71
+ "## Step-3: Connect to Milvus"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 4,
77
+ "metadata": {},
78
+ "outputs": [
79
+ {
80
+ "name": "stderr",
81
+ "output_type": "stream",
82
+ "text": [
83
+ "/home/sujee/my-stuff/projects/ai-alliance/allycat-1/.venv/lib/python3.11/site-packages/milvus_lite/__init__.py:15: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n",
84
+ " from pkg_resources import DistributionNotFound, get_distribution\n",
85
+ "2025-07-14 00:23:38,214 [DEBUG][_create_connection]: Created new connection using: async-workspace/rag_website_milvus.db (async_milvus_client.py:599)\n"
86
+ ]
87
+ },
88
+ {
89
+ "name": "stdout",
90
+ "output_type": "stream",
91
+ "text": [
92
+ "✅ Connected to Milvus instance: workspace/rag_website_milvus.db\n"
93
+ ]
94
+ }
95
+ ],
96
+ "source": [
97
+ "# connect to vector db\n",
98
+ "from llama_index.core import VectorStoreIndex, StorageContext\n",
99
+ "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
100
+ "\n",
101
+ "vector_store = MilvusVectorStore(\n",
102
+ " uri = MY_CONFIG.DB_URI ,\n",
103
+ " dim = MY_CONFIG.EMBEDDING_LENGTH , \n",
104
+ " collection_name = MY_CONFIG.COLLECTION_NAME,\n",
105
+ " overwrite=False # so we load the index from db\n",
106
+ ")\n",
107
+ "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
108
+ "\n",
109
+ "print (\"✅ Connected to Milvus instance: \", MY_CONFIG.DB_URI )"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "markdown",
114
+ "metadata": {},
115
+ "source": [
116
+ "## Step-4: Load Document Index from DB"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": 5,
122
+ "metadata": {},
123
+ "outputs": [
124
+ {
125
+ "name": "stdout",
126
+ "output_type": "stream",
127
+ "text": [
128
+ "✅ Loaded index from vector db: workspace/rag_website_milvus.db\n",
129
+ "CPU times: user 109 ms, sys: 16.8 ms, total: 126 ms\n",
130
+ "Wall time: 123 ms\n"
131
+ ]
132
+ }
133
+ ],
134
+ "source": [
135
+ "%%time\n",
136
+ "\n",
137
+ "from llama_index.core import VectorStoreIndex\n",
138
+ "\n",
139
+ "index = VectorStoreIndex.from_vector_store(\n",
140
+ " vector_store=vector_store, storage_context=storage_context)\n",
141
+ "\n",
142
+ "print (\"✅ Loaded index from vector db:\", MY_CONFIG.DB_URI )"
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "markdown",
147
+ "metadata": {},
148
+ "source": [
149
+ "## Step-5: Setup LLM"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": 6,
155
+ "metadata": {},
156
+ "outputs": [
157
+ {
158
+ "name": "stdout",
159
+ "output_type": "stream",
160
+ "text": [
161
+ "✅ Using LLM model : ollama/gemma3:1b\n"
162
+ ]
163
+ }
164
+ ],
165
+ "source": [
166
+ "from llama_index.llms.litellm import LiteLLM\n",
167
+ "\n",
168
+ "# Setup LLM\n",
169
+ "print (f\"✅ Using LLM model : {MY_CONFIG.LLM_MODEL}\")\n",
170
+ "Settings.llm = LiteLLM (\n",
171
+ " model=MY_CONFIG.LLM_MODEL,\n",
172
+ " )"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "markdown",
177
+ "metadata": {},
178
+ "source": [
179
+ "## Step-6: Query"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": 7,
185
+ "metadata": {},
186
+ "outputs": [
187
+ {
188
+ "name": "stdout",
189
+ "output_type": "stream",
190
+ "text": [
191
+ "The AI Alliance is an international community of researchers, developers, and organizational leaders committed to fostering open innovation across the AI technology landscape to accelerate progress, improve safety, security, diversity and economic competitiveness in AI.\n"
192
+ ]
193
+ }
194
+ ],
195
+ "source": [
196
+ "import query_utils\n",
197
+ "\n",
198
+ "query_engine = index.as_query_engine()\n",
199
+ "query = query_utils.tweak_query('What is AI Alliance?', MY_CONFIG.LLM_MODEL)\n",
200
+ "res = query_engine.query(query)\n",
201
+ "print(res)"
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "code",
206
+ "execution_count": 8,
207
+ "metadata": {},
208
+ "outputs": [
209
+ {
210
+ "name": "stdout",
211
+ "output_type": "stream",
212
+ "text": [
213
+ "The AI Alliance is focused on fostering an open community and enabling developers and researchers to accelerate responsible innovation in AI while ensuring scientific rigor, trust, safety, security, diversity and economic competitiveness.\n"
214
+ ]
215
+ }
216
+ ],
217
+ "source": [
218
+ "query_engine = index.as_query_engine()\n",
219
+ "query = query_utils.tweak_query('What are the main focus areas of AI Alliance?', MY_CONFIG.LLM_MODEL)\n",
220
+ "res = query_engine.query(query)\n",
221
+ "print(res)"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "code",
226
+ "execution_count": 9,
227
+ "metadata": {},
228
+ "outputs": [
229
+ {
230
+ "name": "stdout",
231
+ "output_type": "stream",
232
+ "text": [
233
+ "Based on the provided text, here are some of the AI Alliance projects mentioned:\n",
234
+ "\n",
235
+ "* FPT Software\n",
236
+ "* Hebrew University of Jerusalem\n",
237
+ "* Hugging Face\n",
238
+ "* IBM\n",
239
+ "* Abdus Salam International Centre for Theoretical Physics (ICTP)\n",
240
+ "* Imperial College London\n",
241
+ "* Indian Institute of Technology Bombay\n",
242
+ "* Institute for Computer Science, Artificial Intelligence\n",
243
+ "* Intel\n",
244
+ "* Keio University\n",
245
+ "* LangChain\n",
246
+ "* LlamaIndex\n",
247
+ "* Linux Foundation\n",
248
+ "* Mass Open Cloud Alliance, operated by Boston University and Harvard\n",
249
+ "* Meta\n",
250
+ "* Mohamed bin Zayed University of Artificial Intelligence\n",
251
+ "* MLCommons\n",
252
+ "* National Aeronautics and Space Administration\n",
253
+ "* National Science Foundation\n",
254
+ "* New York University\n",
255
+ "* NumFOCUS\n",
256
+ "* OpenTeams\n",
257
+ "* Oracle\n",
258
+ "* Partnership on AI\n",
259
+ "* Quansight\n",
260
+ "* Red Hat\n",
261
+ "* Rensselaer Polytechnic Institute\n",
262
+ "* Roadzen\n",
263
+ "* Sakana AI\n",
264
+ "* SB Intuitions\n",
265
+ "* ServiceNow\n",
266
+ "* Silo AI\n",
267
+ "* Simons Foundation\n",
268
+ "* Sony Group\n",
269
+ "* Stability AI\n",
270
+ "* Together AI\n",
271
+ "* TU Munich\n",
272
+ "* UC Berkeley College of Computing, Data Science, and Society\n",
273
+ "* University of Illinois Urbana-Champaign\n",
274
+ "* The University of Notre Dame\n",
275
+ "* The University of Texas at Austin\n",
276
+ "* The University of Tokyo\n"
277
+ ]
278
+ }
279
+ ],
280
+ "source": [
281
+ "query_engine = index.as_query_engine()\n",
282
+ "query = query_utils.tweak_query('What are some ai alliance projects?', MY_CONFIG.LLM_MODEL)\n",
283
+ "res = query_engine.query(query)\n",
284
+ "print(res)"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": 10,
290
+ "metadata": {},
291
+ "outputs": [
292
+ {
293
+ "name": "stdout",
294
+ "output_type": "stream",
295
+ "text": [
296
+ "On August 8th, The AI Alliance hosted Open Source AI Demo Night in San Francisco.\n"
297
+ ]
298
+ }
299
+ ],
300
+ "source": [
301
+ "query_engine = index.as_query_engine()\n",
302
+ "query = query_utils.tweak_query('Where was the demo night held?', MY_CONFIG.LLM_MODEL)\n",
303
+ "res = query_engine.query(query)\n",
304
+ "print(res)"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": 11,
310
+ "metadata": {},
311
+ "outputs": [
312
+ {
313
+ "name": "stdout",
314
+ "output_type": "stream",
315
+ "text": [
316
+ "The AI Alliance is focused on developing and sharing foundational models for science.\n"
317
+ ]
318
+ }
319
+ ],
320
+ "source": [
321
+ "query_engine = index.as_query_engine()\n",
322
+ "query = query_utils.tweak_query('What is the AI Alliance doing in the area of material science?', MY_CONFIG.LLM_MODEL)\n",
323
+ "res = query_engine.query(query)\n",
324
+ "print(res)"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "code",
329
+ "execution_count": 12,
330
+ "metadata": {},
331
+ "outputs": [
332
+ {
333
+ "name": "stdout",
334
+ "output_type": "stream",
335
+ "text": [
336
+ "By submitting this form, you agree that the AI Alliance will collect and process the personal information you provide to keep you informed about AI Alliance initiatives and enable your involvement in AI Alliance activities. Additionally, you agree that the AI Alliance may share the personal information you provide with its member organizations so that they may communicate with you about AI Alliance initiatives and your involvement in AI Alliance activities.\n",
337
+ "\n",
338
+ "You may withdraw your consent for the processing of your personal information by the AI Alliance. Please contact us to request a permanent deletion.\n"
339
+ ]
340
+ }
341
+ ],
342
+ "source": [
343
+ "query_engine = index.as_query_engine()\n",
344
+ "query = query_utils.tweak_query('How do I join the AI Alliance?', MY_CONFIG.LLM_MODEL)\n",
345
+ "res = query_engine.query(query)\n",
346
+ "print(res)"
347
+ ]
348
+ },
349
+ {
350
+ "cell_type": "code",
351
+ "execution_count": 13,
352
+ "metadata": {},
353
+ "outputs": [
354
+ {
355
+ "name": "stdout",
356
+ "output_type": "stream",
357
+ "text": [
358
+ "The context does not provide information about the moon landing.\n"
359
+ ]
360
+ }
361
+ ],
362
+ "source": [
363
+ "query_engine = index.as_query_engine()\n",
364
+ "query = query_utils.tweak_query('When was the moon landing?', MY_CONFIG.LLM_MODEL)\n",
365
+ "res = query_engine.query(query)\n",
366
+ "print(res)"
367
+ ]
368
+ },
369
+ {
370
+ "cell_type": "code",
371
+ "execution_count": null,
372
+ "metadata": {},
373
+ "outputs": [],
374
+ "source": []
375
+ }
376
+ ],
377
+ "metadata": {
378
+ "kernelspec": {
379
+ "display_name": "allycat-1",
380
+ "language": "python",
381
+ "name": "python3"
382
+ },
383
+ "language_info": {
384
+ "codemirror_mode": {
385
+ "name": "ipython",
386
+ "version": 3
387
+ },
388
+ "file_extension": ".py",
389
+ "mimetype": "text/x-python",
390
+ "name": "python",
391
+ "nbconvert_exporter": "python",
392
+ "pygments_lexer": "ipython3",
393
+ "version": "3.11.12"
394
+ }
395
+ },
396
+ "nbformat": 4,
397
+ "nbformat_minor": 4
398
+ }
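Each query cell in the notebook rebuilds query_engine with index.as_query_engine() before calling it. The engine can equally be created once and reused, optionally with a retrieval depth; a small sketch that relies on the index, query_utils, and MY_CONFIG objects from the earlier cells (similarity_top_k is an illustrative tuning knob, not a value used above):

# Reuse one query engine across questions; similarity_top_k controls how many
# chunks are retrieved per query.
query_engine = index.as_query_engine(similarity_top_k=5)

for q in ["What is AI Alliance?", "How do I join the AI Alliance?"]:
    print(query_engine.query(query_utils.tweak_query(q, MY_CONFIG.LLM_MODEL)))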
4_query.py ADDED
@@ -0,0 +1,194 @@
1
+ """
2
+ Vector RAG Query
3
+ """
4
+
5
+ import os
6
+ from my_config import MY_CONFIG
7
+
8
+ # Route Hugging Face downloads through the endpoint configured in MY_CONFIG (useful when https://huggingface.co/ is unreachable)
9
+ os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT
10
+
11
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
+ from llama_index.core import Settings
13
+ from llama_index.core import VectorStoreIndex, StorageContext
14
+ from llama_index.vector_stores.milvus import MilvusVectorStore
15
+ from dotenv import load_dotenv
16
+ from llama_index.llms.litellm import LiteLLM
17
+ import query_utils
18
+ import time
19
+ import logging
20
+ import json
21
+
22
+ # Create logs directory if it doesn't exist
23
+ os.makedirs('logs/query', exist_ok=True)
24
+
25
+ logging.basicConfig(
26
+ level=logging.INFO,
27
+ format='%(asctime)s - %(levelname)s - %(message)s',
28
+ handlers=[
29
+ logging.FileHandler('logs/query/query_log.txt', mode='a'), # Save to file
30
+ logging.StreamHandler() # Also show in console
31
+ ],
32
+ force=True
33
+ )
34
+ logger = logging.getLogger(__name__)
35
+ logger.setLevel(logging.INFO)
36
+
37
+
38
+ def run_query(query: str):
39
+ global query_engine
40
+ logger.info("-----------------------------------")
41
+ start_time = time.time()
42
+ query = query_utils.tweak_query(query, MY_CONFIG.LLM_MODEL)
43
+ logger.info (f"\nProcessing Query:\n{query}")
44
+
45
+ # Get initial vector response
46
+ vector_response = query_engine.query(query)
47
+ vector_text = str(vector_response).strip()
48
+
49
+ # Structured prompt
50
+ structured_prompt = f"""Please provide a comprehensive, well-structured answer using the provided document information.
51
+
52
+ Question: {query}
53
+
54
+ Document Information:
55
+ {vector_text}
56
+
57
+ Instructions:
58
+ 1. Provide accurate, factual information based on the documents
59
+ 2. Structure your response clearly with proper formatting
60
+ 3. Be comprehensive yet concise
61
+ 4. Highlight key relationships and important details when relevant
62
+ 5. Use bullet points or sections when appropriate for clarity
63
+
64
+ Please provide your answer:"""
65
+
66
+ # Use structured prompt for final synthesis
67
+ res = query_engine.query(structured_prompt)
68
+
69
+ end_time = time.time()
70
+ total_time = end_time - start_time
71
+ logger.info ( "-------"
72
+ + f"\nResponse:\n{res}"
73
+ + f"\n\n⏱️ Total time: {total_time:.1f} seconds"
74
+ + f"\n\nResponse Metadata:\n{json.dumps(res.metadata, indent=2)}"
75
+ + f"\nSource Nodes: {[node.node_id for node in res.source_nodes]}"
76
+ )
77
+ logger.info("-----------------------------------")
78
+
79
+ # Save response and metadata to files
80
+ _save_query_files(query, res, total_time)
81
+
82
+ return res
83
+
84
+ def _save_query_files(query: str, response, total_time: float):
85
+ """Save query response and metadata to files."""
86
+ import time
87
+ timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
88
+
89
+ try:
90
+ # Save response to file
91
+ with open('logs/query/query_responses.txt', 'a', encoding='utf-8') as f:
92
+ f.write(f"\n{'='*80}\n")
93
+ f.write(f"QUERY [{timestamp}]: {query}\n")
94
+ f.write(f"{'='*80}\n")
95
+ f.write(f"RESPONSE: {response}\n")
96
+ f.write(f"TIME: {total_time:.1f} seconds\n")
97
+ f.write(f"{'='*80}\n\n")
98
+
99
+ # Save metadata to file
100
+ with open('logs/query/query_metadata.txt', 'a', encoding='utf-8') as f:
101
+ f.write(f"\n{'='*80}\n")
102
+ f.write(f"METADATA [{timestamp}]: {query}\n")
103
+ f.write(f"{'='*80}\n")
104
+ f.write(f"TIME: {total_time:.1f} seconds\n")
105
+ f.write(json.dumps(response.metadata, indent=2, default=str))
106
+ f.write(f"\n{'='*80}\n\n")
107
+
108
+ logger.info(f"Saved response and metadata for query: {query[:50]}...")
109
+ except Exception as e:
110
+ logger.error(f"Failed to save query files: {e}")
111
+
112
+ ## ======= end : run_query =======
113
+
114
+ ## load env config
115
+ load_dotenv()
116
+
117
+ # Setup embeddings
118
+ Settings.embed_model = HuggingFaceEmbedding(
119
+ model_name = MY_CONFIG.EMBEDDING_MODEL
120
+ )
121
+ logger.info (f"✅ Using embedding model: {MY_CONFIG.EMBEDDING_MODEL}")
122
+
123
+ # Connect to vector database based on configuration
124
+ if MY_CONFIG.VECTOR_DB_TYPE == "cloud_zilliz":
125
+ # Use Zilliz Cloud
126
+ if not MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT or not MY_CONFIG.ZILLIZ_TOKEN:
127
+ raise ValueError("Cloud database configuration missing. Set ZILLIZ_CLUSTER_ENDPOINT and ZILLIZ_TOKEN in .env")
128
+
129
+ vector_store = MilvusVectorStore(
130
+ uri=MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT,
131
+ token=MY_CONFIG.ZILLIZ_TOKEN,
132
+ dim=MY_CONFIG.EMBEDDING_LENGTH,
133
+ collection_name=MY_CONFIG.COLLECTION_NAME,
134
+ overwrite=False
135
+ )
136
+ storage_context = StorageContext.from_defaults(vector_store=vector_store)
137
+ logger.info("Connected to cloud vector database")
138
+ else:
139
+ # Use local Milvus (default)
140
+ vector_store = MilvusVectorStore(
141
+ uri=MY_CONFIG.MILVUS_URI_VECTOR,
142
+ dim=MY_CONFIG.EMBEDDING_LENGTH,
143
+ collection_name=MY_CONFIG.COLLECTION_NAME,
144
+ overwrite=False
145
+ )
146
+ storage_context = StorageContext.from_defaults(vector_store=vector_store)
147
+ logger.info("Connected to local vector database")
148
+
149
+ # Load Document Index from database
150
+ index = VectorStoreIndex.from_vector_store(
151
+ vector_store=vector_store, storage_context=storage_context)
152
+ logger.info("Vector index loaded successfully")
153
+
154
+ # Setup LLM
155
+ logger.info (f"✅ Using LLM model : {MY_CONFIG.LLM_MODEL}")
156
+ Settings.llm = LiteLLM (
157
+ model=MY_CONFIG.LLM_MODEL,
158
+ )
159
+
160
+ query_engine = index.as_query_engine()
161
+
162
+ # Sample queries
163
+ queries = [
164
+ # "What is AI Alliance?",
165
+ # "What are the main focus areas of AI Alliance?",
166
+ # "What are some ai alliance projects?",
167
+ # "What are the upcoming events?",
168
+ # "How do I join the AI Alliance?",
169
+ # "When was the moon landing?",
170
+ ]
171
+
172
+ for query in queries:
173
+ run_query(query)
174
+
175
+ logger.info("-----------------------------------")
176
+
177
+ while True:
178
+ # Get user input
179
+ user_query = input("\nEnter your question (or 'q' to exit): ")
180
+
181
+ # Check if user wants to quit
182
+ if user_query.lower() in ['quit', 'exit', 'q']:
183
+ logger.info ("Goodbye!")
184
+ break
185
+
186
+ # Process the query
187
+ if user_query.strip() == "":
188
+ continue
189
+
190
+ try:
191
+ run_query(user_query)
192
+ except Exception as e:
193
+ logger.error(f"Error processing query: {e}")
194
+ print(f"Error processing query: {e}")
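The sample queries and the interactive prompt in 4_query.py run at module import time, so importing run_query() from another script would also start the input loop. A hedged sketch of a main-guard arrangement that keeps the module importable; the guard is not present in the script above:

def main():
    # Run the sample queries, then drop into the interactive prompt
    for query in queries:
        run_query(query)
    while True:
        user_query = input("\nEnter your question (or 'q' to exit): ")
        if user_query.lower() in ('quit', 'exit', 'q'):
            break
        if user_query.strip():
            run_query(user_query)

if __name__ == "__main__":
    main()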
4b_query_graph copy.py ADDED
@@ -0,0 +1,338 @@
1
+ """
2
+ GraphRAG Implementation - Main Query Engine
3
+
4
+ Imports Step 1 functionality from query-graph-functions/setup.py
5
+ and implements the complete 25-step DRIFT search methodology.
6
+ """
7
+
8
+ import time
9
+ import logging
10
+ import json
11
+ import importlib
12
+ import sys
13
+ import os
14
+ import asyncio
15
+ from typing import Dict, Any
16
+
17
+ # Apply nest_asyncio to allow nested event loops
18
+ import nest_asyncio
19
+ nest_asyncio.apply()
20
+
21
+ # Import Step 1 functionality from setup module
22
+ from query_graph_functions.setup import create_graphrag_setup
23
+ # Import Steps 3-5 functionality from query preprocessing module
24
+ from query_graph_functions.query_preprocessing import (
25
+ create_query_preprocessor,
26
+ preprocess_query_pipeline
27
+ )
28
+ # Import Steps 6-8 functionality from knowledge retrieval module
29
+ from query_graph_functions.knowledge_retrieval import CommunitySearchEngine
30
+ # Import Steps 9-12 functionality from follow-up search module
31
+ from query_graph_functions.follow_up_search import FollowUpSearch
32
+ # Import Steps 13-14 functionality from vector augmentation module
33
+ from query_graph_functions.vector_augmentation import VectorAugmentationEngine
34
+ # Import Steps 15-16 functionality from answer synthesis module
35
+ from query_graph_functions.answer_synthesis import AnswerSynthesisEngine
36
+ # Import Steps 17-20 functionality from response management module
37
+ from query_graph_functions.response_management import ResponseManager
38
+ from my_config import MY_CONFIG
39
+ import query_utils
40
+
41
+ # Configure logging - Save to file and console
42
+ logging.basicConfig(
43
+ level=logging.INFO,
44
+ format='%(asctime)s - %(levelname)s - %(message)s',
45
+ handlers=[
46
+ logging.FileHandler('logs/graphrag_query/graphrag_query_log.txt', mode='a'), # Save to file
47
+ logging.StreamHandler() # Also show in console
48
+ ],
49
+ force=True
50
+ )
51
+ logger = logging.getLogger(__name__)
52
+ logger.setLevel(logging.INFO)
53
+
54
+ # Log session start
55
+ logger.info("=" * 80)
56
+ logger.info(f"GraphRAG Session Started - {time.strftime('%Y-%m-%d %H:%M:%S')}")
57
+ logger.info("=" * 80)
58
+
59
+
60
+ class GraphQueryEngine:
61
+ """
62
+ GraphRAG Query Engine - Complete Implementation
63
+
64
+ Uses setup module for Step 1 initialization and query preprocessing
65
+ module for Steps 3-5, implementing the full 25-step DRIFT search methodology.
66
+ """
67
+
68
+ def __init__(self):
69
+ logger.info("GraphRAG Query Engine Initializing")
70
+
71
+ # Initialize using setup module (Step 1)
72
+ self.setup = create_graphrag_setup()
73
+
74
+ # Extract components from setup
75
+ self.neo4j_conn = self.setup.neo4j_conn
76
+ self.query_engine = self.setup.query_engine
77
+ self.graph_stats = self.setup.graph_stats
78
+ self.drift_config = self.setup.drift_config
79
+ self.llm = self.setup.llm
80
+ self.config = self.setup.config
81
+
82
+ # Initialize query preprocessor (Steps 3-5) - will be created async
83
+ self.query_preprocessor = None
84
+
85
+ # Initialize response manager (Steps 17-20)
86
+ self.response_manager = ResponseManager(self.setup)
87
+
88
+ logger.info("GraphRAG Query Engine Ready")
89
+
90
+ async def run_query_async(self, user_query: str) -> Dict[str, Any]:
91
+ """
92
+ GraphRAG Query Pipeline - Main Entry Point (Async)
93
+
94
+ Implements Phase B (Steps 3-5) of the 25-step DRIFT search methodology
95
+ """
96
+ logger.info("=" * 60)
97
+ logger.info("GraphRAG Query Pipeline Starting")
98
+ logger.info("=" * 60)
99
+
100
+ start_time = time.time()
101
+
102
+ # Apply query optimization
103
+ optimized_query = query_utils.tweak_query(user_query, MY_CONFIG.LLM_MODEL)
104
+ logger.info(f"Original Query: {user_query}")
105
+ if optimized_query != user_query:
106
+ logger.info(f"Optimized Query: {optimized_query}")
107
+
108
+ try:
109
+ # Validate system readiness using setup module
110
+ if not self.setup.validate_system_readiness():
111
+ return self._generate_error_response("System not properly initialized")
112
+
113
+ # PHASE B: QUERY PREPROCESSING (Steps 3-5)
114
+ logger.info("Phase B: Starting Query Preprocessing (Steps 3-5)")
115
+
116
+ # Initialize query preprocessor if needed
117
+ if not self.query_preprocessor:
118
+ self.query_preprocessor = await create_query_preprocessor(
119
+ self.config, self.graph_stats
120
+ )
121
+
122
+ # Execute complete preprocessing pipeline
123
+ analysis, routing, vectorization = await preprocess_query_pipeline(
124
+ optimized_query, self.config, self.graph_stats
125
+ )
126
+
127
+ logger.info(f"Phase B Completed: "
128
+ f"Type={analysis.query_type.value}, "
129
+ f"Strategy={routing.search_strategy.value}")
130
+
131
+ # PHASE C: COMMUNITY RETRIEVAL (Steps 6-7)
132
+ logger.info("Phase C: Starting Community Retrieval (Steps 6-7)")
133
+
134
+ # Create community search engine
135
+ community_engine = CommunitySearchEngine(self.setup)
136
+
137
+ # Execute the primer phase (Steps 6-8)
138
+ community_results = await community_engine.execute_primer_phase(
139
+ vectorization.embedding, routing
140
+ )
141
+
142
+ # Extract communities for Phase D
143
+ communities = community_results['communities']
144
+
145
+ logger.info(f"Phase C Completed: Retrieved {len(communities)} communities")
146
+
147
+ # PHASE D: FOLLOW-UP SEARCH (Steps 9-12)
148
+ logger.info("Phase D: Starting Follow-up Search (Steps 9-12)")
149
+
150
+ # Create follow-up search engine
151
+ follow_up_engine = FollowUpSearch(self.setup)
152
+
153
+ # Execute follow-up search phase
154
+ follow_up_results = await follow_up_engine.execute_follow_up_phase(
155
+ community_results, routing
156
+ )
157
+
158
+ logger.info(f"Phase D Completed: Generated {len(follow_up_results.get('intermediate_answers', []))} detailed answers")
159
+
160
+ # PHASE E: VECTOR SEARCH AUGMENTATION (Steps 13-14)
161
+ logger.info("Phase E: Starting Vector Search Augmentation (Steps 13-14)")
162
+
163
+ # Create vector augmentation engine
164
+ vector_engine = VectorAugmentationEngine(self.setup)
165
+
166
+ # Execute vector augmentation phase
167
+ augmentation_results = await vector_engine.execute_vector_augmentation_phase(
168
+ vectorization.embedding,
169
+ {'communities': communities, 'initial_answer': community_results['initial_answer'], 'follow_up_results': follow_up_results},
170
+ routing
171
+ )
172
+
173
+ logger.info(f"Phase E Completed: Vector augmentation confidence: {augmentation_results.augmentation_confidence:.3f}")
174
+
175
+ # PHASE F: ANSWER SYNTHESIS (Steps 15-16)
176
+ logger.info("Phase F: Starting Answer Synthesis (Steps 15-16)")
177
+
178
+ # Create answer synthesis engine
179
+ synthesis_engine = AnswerSynthesisEngine(self.setup)
180
+
181
+ # Execute comprehensive answer synthesis
182
+ synthesis_results = await synthesis_engine.execute_answer_synthesis_phase(
183
+ analysis, routing, community_results, follow_up_results, augmentation_results
184
+ )
185
+
186
+ logger.info(f"Phase F Completed: Final synthesis confidence: {synthesis_results.confidence_score:.3f}")
187
+
188
+ # PHASE G: RESPONSE MANAGEMENT (Steps 17-20)
189
+ logger.info("Phase G: Starting Response Management (Steps 17-20)")
190
+
191
+ # Enhanced implementation using preprocessing results
192
+ if self.query_engine:
193
+ # Use the vectorized query for better results
194
+ _ = self.query_engine.query(vectorization.normalized_query)
195
+ total_time = time.time() - start_time
196
+
197
+ logger.info(f"Enhanced Query Completed in {total_time:.2f}s")
198
+ logger.info("=" * 60)
199
+
200
+ # Use Phase F synthesis result as the final answer
201
+ enhanced_answer = synthesis_results.final_answer
202
+
203
+ # Generate comprehensive metadata using ResponseManager
204
+ metadata = self.response_manager.generate_metadata(
205
+ analysis=analysis,
206
+ routing=routing,
207
+ vectorization=vectorization,
208
+ community_results=community_results,
209
+ follow_up_results=follow_up_results,
210
+ augmentation_results=augmentation_results,
211
+ synthesis_results=synthesis_results,
212
+ total_time=total_time,
213
+ graph_stats=self.graph_stats,
214
+ config=self.config
215
+ )
216
+
217
+ result = {
218
+ "answer": enhanced_answer,
219
+ "metadata": metadata
220
+ }
221
+
222
+ # Save response and metadata to files using ResponseManager
223
+ self.response_manager.save_response_to_files(user_query, result)
224
+
225
+ logger.info("Phase G Completed: Response management finished")
226
+
227
+ return result
228
+ else:
229
+ return await synthesis_engine.generate_error_response("Query engine not available")
230
+
231
+ except Exception as e:
232
+ logger.error(f"Query Pipeline Failed: {e}")
233
+ return await synthesis_engine.generate_error_response(f"Query processing error: {e}")
234
+
235
+ def run_query(self, user_query: str) -> Dict[str, Any]:
236
+ """
237
+ Synchronous wrapper for async query processing.
238
+
239
+ This maintains backward compatibility while using the new async pipeline.
240
+ Uses nest_asyncio and our LiteLLM patch to properly handle async tasks.
241
+ """
242
+ try:
243
+ # Use the current event loop since nest_asyncio.apply() has been called
244
+ loop = asyncio.get_event_loop()
245
+
246
+ # Create a future to gather all tasks and wait for completion
247
+ async def run_with_cleanup():
248
+ try:
249
+ # Run the main query
250
+ result = await self.run_query_async(user_query)
251
+
252
+ # Use setup module's cleanup function
253
+ await self.setup.cleanup_async_tasks(timeout=2.0)
254
+
255
+ return result
256
+ except Exception as e:
257
+ logger.error(f"Async Query Execution Failed: {e}")
258
+ raise e
259
+
260
+ # Run the async function with cleanup
261
+ return loop.run_until_complete(run_with_cleanup())
262
+
263
+ except Exception as e:
264
+ logger.error(f"Sync Query Wrapper Failed: {e}")
265
+ # Use synthesis engine for error handling
266
+ synthesis_engine = AnswerSynthesisEngine(self.setup)
267
+ loop = asyncio.get_event_loop()
268
+ return loop.run_until_complete(
269
+ synthesis_engine.generate_error_response(f"Query processing error: {e}")
270
+ )
271
+
272
+ def close(self):
273
+ """Clean up connections using setup module"""
274
+ if self.setup:
275
+ self.setup.close()
276
+ logger.info("GraphQueryEngine cleanup complete")
277
+
278
+
279
+ if __name__ == "__main__":
280
+ print("GraphRAG Implementation - Hot Reload Enabled")
281
+ print("=" * 50)
282
+ print("Step 1: Initialization and Connection")
283
+ print("Hot Reload: Type 'r' to reload modules")
284
+ print("=" * 50)
285
+
286
+ engine = GraphQueryEngine()
287
+
288
+ try:
289
+ # Create an event loop for the main thread
290
+ loop = asyncio.new_event_loop()
291
+ asyncio.set_event_loop(loop)
292
+
293
+ while True:
294
+ user_query = input("\nEnter your question ('q' to exit, 'r' to reload): ")
295
+
296
+ if user_query.lower() in ['quit', 'exit', 'q']:
297
+ print("Goodbye!")
298
+ break
299
+
300
+ if user_query.lower() == 'r':
301
+ print("Reloading...")
302
+ engine.close()
303
+
304
+ # Run cleanup tasks before reloading using setup module
305
+ # Guard: run_until_complete(None) would raise if setup is missing
+ if engine.setup:
+     loop.run_until_complete(engine.setup.cleanup_async_tasks(timeout=3.0))
308
+
309
+ engine = GraphQueryEngine()
310
+ print("Reloaded!")
311
+ continue
312
+
313
+ if user_query.strip() == "":
314
+ continue
315
+
316
+ # Direct method call - clean forward-only implementation
317
+ result = engine.run_query(user_query)
318
+
319
+ # Print results
320
+ print("\n" + "=" * 60)
321
+ print("GraphRAG Query Results")
322
+ print("=" * 60)
323
+ print(f"Answer: {result['answer']}")
324
+ print(f"\nMetadata: {json.dumps(result['metadata'], indent=2)}")
325
+ print("=" * 60)
326
+
327
+ except Exception as e:
328
+ logger.error(f"Error processing query: {e}")
329
+ print(f"Error processing query: {e}")
330
+ finally:
331
+ # Run final cleanup before exiting using setup module
332
+ if 'loop' in locals() and 'engine' in locals():
333
+ # Guard: only schedule cleanup when a setup object exists
+ if engine.setup:
+     loop.run_until_complete(engine.setup.cleanup_async_tasks(timeout=5.0))
336
+ loop.close()
337
+ if 'engine' in locals():
338
+ engine.close()
4b_query_graph.py ADDED
@@ -0,0 +1,327 @@
1
+ """
2
+ GraphRAG Query: orchestrates the DRIFT search pipeline from setup and query preprocessing through community retrieval, follow-up search, vector augmentation, answer synthesis, and response management.
3
+ """
4
+
5
+ import time
6
+ import logging
7
+ import json
8
+ import importlib
9
+ import sys
10
+ import os
11
+ import asyncio
12
+ from typing import Dict, Any
13
+
14
+ # Apply nest_asyncio to allow nested event loops
15
+ import nest_asyncio
16
+ nest_asyncio.apply()
17
+
18
+ # Import Step 1 functionality from setup module
19
+ from query_graph_functions.setup import create_graphrag_setup
20
+ # Import Steps 3-5 functionality from query preprocessing module
21
+ from query_graph_functions.query_preprocessing import (
22
+ create_query_preprocessor,
23
+ preprocess_query_pipeline
24
+ )
25
+ # Import Steps 6-8 functionality from knowledge retrieval module
26
+ from query_graph_functions.knowledge_retrieval import CommunitySearchEngine
27
+ # Import Steps 9-12 functionality from follow-up search module
28
+ from query_graph_functions.follow_up_search import FollowUpSearch
29
+ # Import Steps 13-14 functionality from vector augmentation module
30
+ from query_graph_functions.vector_augmentation import VectorAugmentationEngine
31
+ # Import Steps 15-16 functionality from answer synthesis module
32
+ from query_graph_functions.answer_synthesis import AnswerSynthesisEngine
33
+ # Import Steps 17-20 functionality from response management module
34
+ from query_graph_functions.response_management import ResponseManager
35
+ from my_config import MY_CONFIG
36
+ import query_utils
37
+
38
+ # Create logs directory if it doesn't exist
39
+ os.makedirs('logs/graphrag_query', exist_ok=True)
40
+
41
+ # Configure logging - Save to file and console
42
+ logging.basicConfig(
43
+ level=logging.INFO,
44
+ format='%(asctime)s - %(levelname)s - %(message)s',
45
+ handlers=[
46
+ logging.FileHandler('logs/graphrag_query/graphrag_query_log.txt', mode='a'), # Save to file
47
+ logging.StreamHandler() # Also show in console
48
+ ],
49
+ force=True
50
+ )
51
+ logger = logging.getLogger(__name__)
52
+ logger.setLevel(logging.INFO)
53
+
54
+ # Log session start
55
+ logger.info("=" * 80)
56
+ logger.info(f"GraphRAG Session Started - {time.strftime('%Y-%m-%d %H:%M:%S')}")
57
+ logger.info("=" * 80)
58
+
59
+
60
+ class GraphQueryEngine:
61
+ def __init__(self):
62
+ logger.info("GraphRAG Query Engine Initializing")
63
+
64
+ # Initialize using setup module (Step 1)
65
+ self.setup = create_graphrag_setup()
66
+
67
+ # Extract components from setup
68
+ self.neo4j_conn = self.setup.neo4j_conn
69
+ self.query_engine = self.setup.query_engine
70
+ self.graph_stats = self.setup.graph_stats
71
+ self.drift_config = self.setup.drift_config
72
+ self.llm = self.setup.llm
73
+ self.config = self.setup.config
74
+
75
+ # Initialize query preprocessor (Steps 3-5) - will be created async
76
+ self.query_preprocessor = None
77
+
78
+ # Initialize response manager (Steps 17-20)
79
+ self.response_manager = ResponseManager(self.setup)
80
+
81
+ logger.info("GraphRAG Query Engine Ready")
82
+
83
+ async def run_query_async(self, user_query: str) -> Dict[str, Any]:
84
+ """
85
+ GraphRAG Query Pipeline - Main Entry Point (Async)
86
+
87
+ Implements Phase B (Steps 3-5) of the 25-step DRIFT search methodology
88
+ """
89
+ logger.info("=" * 60)
90
+ logger.info("GraphRAG Query Pipeline Starting")
91
+ logger.info("=" * 60)
92
+
93
+ start_time = time.time()
94
+
95
+ # Apply query optimization
96
+ optimized_query = query_utils.tweak_query(user_query, MY_CONFIG.LLM_MODEL)
97
+ logger.info(f"Original Query: {user_query}")
98
+ if optimized_query != user_query:
99
+ logger.info(f"Optimized Query: {optimized_query}")
100
+
101
+ try:
102
+ # Validate system readiness using setup module
103
+ if not self.setup.validate_system_readiness():
104
+ # Note: this class defines no _generate_error_response helper; route the error through the synthesis engine instead
+ return AnswerSynthesisEngine(self.setup).generate_error_response("System not properly initialized")
105
+
106
+ # PHASE B: QUERY PREPROCESSING (Steps 3-5)
107
+ logger.info("Phase B: Starting Query Preprocessing (Steps 3-5)")
108
+
109
+ # Initialize query preprocessor if needed
110
+ if not self.query_preprocessor:
111
+ self.query_preprocessor = await create_query_preprocessor(
112
+ self.config, self.graph_stats
113
+ )
114
+
115
+ # Execute complete preprocessing pipeline
116
+ analysis, routing, vectorization = await preprocess_query_pipeline(
117
+ optimized_query, self.config, self.graph_stats
118
+ )
119
+
120
+ logger.info(f"Phase B Completed: "
121
+ f"Type={analysis.query_type.value}, "
122
+ f"Strategy={routing.search_strategy.value}")
123
+
124
+ # PHASE C: COMMUNITY RETRIEVAL (Steps 6-7)
125
+ logger.info("Phase C: Starting Community Retrieval (Steps 6-7)")
126
+
127
+ # Create community search engine
128
+ community_engine = CommunitySearchEngine(self.setup)
129
+
130
+ # Execute the primer phase (Steps 6-8)
131
+ community_results = await community_engine.execute_primer_phase(
132
+ vectorization.embedding, routing
133
+ )
134
+
135
+ # Extract communities for Phase D
136
+ communities = community_results['communities']
137
+
138
+ logger.info(f"Phase C Completed: Retrieved {len(communities)} communities")
139
+
140
+ # PHASE D: FOLLOW-UP SEARCH (Steps 9-12)
141
+ logger.info("Phase D: Starting Follow-up Search (Steps 9-12)")
142
+
143
+ # Create follow-up search engine
144
+ follow_up_engine = FollowUpSearch(self.setup)
145
+
146
+ # Execute follow-up search phase
147
+ follow_up_results = await follow_up_engine.execute_follow_up_phase(
148
+ community_results, routing
149
+ )
150
+
151
+ logger.info(f"Phase D Completed: Generated {len(follow_up_results.get('intermediate_answers', []))} detailed answers")
152
+
153
+ # PHASE E: VECTOR SEARCH AUGMENTATION (Steps 13-14)
154
+ logger.info("Phase E: Starting Vector Search Augmentation (Steps 13-14)")
155
+
156
+ # Create vector augmentation engine
157
+ vector_engine = VectorAugmentationEngine(self.setup)
158
+
159
+ # Execute vector augmentation phase
160
+ augmentation_results = await vector_engine.execute_vector_augmentation_phase(
161
+ vectorization.embedding,
162
+ {'communities': communities, 'initial_answer': community_results['initial_answer'], 'follow_up_results': follow_up_results},
163
+ routing
164
+ )
165
+
166
+ logger.info(f"Phase E Completed: Vector augmentation confidence: {augmentation_results.augmentation_confidence:.3f}")
167
+
168
+ # PHASE F: ANSWER SYNTHESIS (Steps 15-16)
169
+ logger.info("Phase F: Starting Answer Synthesis (Steps 15-16)")
170
+
171
+ # Create answer synthesis engine
172
+ synthesis_engine = AnswerSynthesisEngine(self.setup)
173
+
174
+ # Execute comprehensive answer synthesis
175
+ synthesis_results = await synthesis_engine.execute_answer_synthesis_phase(
176
+ analysis, routing, community_results, follow_up_results, augmentation_results
177
+ )
178
+
179
+ logger.info(f"Phase F Completed: Final synthesis confidence: {synthesis_results.confidence_score:.3f}")
180
+
181
+ # PHASE G: RESPONSE MANAGEMENT (Steps 17-20)
182
+ logger.info("Phase G: Starting Response Management (Steps 17-20)")
183
+
184
+ # Enhanced implementation using preprocessing results
185
+ if self.query_engine:
186
+ # Use the vectorized query for better results
187
+ _ = self.query_engine.query(vectorization.normalized_query)
188
+ total_time = time.time() - start_time
189
+
190
+ logger.info(f"Enhanced Query Completed in {total_time:.2f}s")
191
+ logger.info("=" * 60)
192
+
193
+ # Use Phase F synthesis result as the final answer
194
+ enhanced_answer = synthesis_results.final_answer
195
+
196
+ # Generate comprehensive metadata using ResponseManager
197
+ metadata = self.response_manager.generate_comprehensive_metadata(
198
+ analysis=analysis,
199
+ routing=routing,
200
+ vectorization=vectorization,
201
+ community_results=community_results,
202
+ follow_up_results=follow_up_results,
203
+ augmentation_results=augmentation_results,
204
+ synthesis_results=synthesis_results,
205
+ total_time=total_time
206
+ )
207
+
208
+ result = {
209
+ "answer": enhanced_answer,
210
+ "metadata": metadata
211
+ }
212
+
213
+ # Save response and metadata to files using ResponseManager
214
+ self.response_manager.save_response_to_files(user_query, result)
215
+
216
+ logger.info("Phase G Completed: Response management finished")
217
+
218
+ return result
219
+ else:
220
+ return synthesis_engine.generate_error_response("Query engine not available")
221
+
222
+ except Exception as e:
223
+ logger.error(f"Query Pipeline Failed: {e}")
224
+ synthesis_engine = AnswerSynthesisEngine(self.setup)
225
+ return synthesis_engine.generate_error_response(f"Query processing error: {e}")
226
+
227
+ def run_query(self, user_query: str) -> Dict[str, Any]:
228
+ """
229
+ Synchronous wrapper for async query processing.
230
+
231
+ This maintains backward compatibility while using the new async pipeline.
232
+ Uses nest_asyncio and our LiteLLM patch to properly handle async tasks.
233
+ """
234
+ try:
235
+ # Use the current event loop since nest_asyncio.apply() has been called
236
+ loop = asyncio.get_event_loop()
237
+
238
+ # Create a future to gather all tasks and wait for completion
239
+ async def run_with_cleanup():
240
+ try:
241
+ # Run the main query
242
+ result = await self.run_query_async(user_query)
243
+
244
+ # Use setup module's cleanup function
245
+ await self.setup.cleanup_async_tasks(timeout=2.0)
246
+
247
+ return result
248
+ except Exception as e:
249
+ logger.error(f"Async Query Execution Failed: {e}")
250
+ raise e
251
+
252
+ # Run the async function with cleanup
253
+ return loop.run_until_complete(run_with_cleanup())
254
+
255
+ except Exception as e:
256
+ logger.error(f"Sync Query Wrapper Failed: {e}")
257
+ # Use synthesis engine for error handling
258
+ synthesis_engine = AnswerSynthesisEngine(self.setup)
259
+ return synthesis_engine.generate_error_response(f"Query processing error: {e}")
260
+
261
+ def close(self):
262
+ """Clean up connections using setup module"""
263
+ if self.setup:
264
+ self.setup.close()
265
+ logger.info("GraphQueryEngine cleanup complete")
266
+
267
+
268
+ if __name__ == "__main__":
269
+ print("GraphRAG Implementation - Hot Reload Enabled")
270
+ print("=" * 50)
271
+ print("Step 1: Initialization and Connection")
272
+ print("Hot Reload: Type 'r' to reload modules")
273
+ print("=" * 50)
274
+
275
+ engine = GraphQueryEngine()
276
+
277
+ try:
278
+ # Create an event loop for the main thread
279
+ loop = asyncio.new_event_loop()
280
+ asyncio.set_event_loop(loop)
281
+
282
+ while True:
283
+ user_query = input("\nEnter your question ('q' to exit, 'r' to reload): ")
284
+
285
+ if user_query.lower() in ['quit', 'exit', 'q']:
286
+ print("Goodbye!")
287
+ break
288
+
289
+ if user_query.lower() == 'r':
290
+ print("Reloading...")
291
+ engine.close()
292
+
293
+ # Run cleanup tasks before reloading using setup module
294
+ # Guard: run_until_complete(None) would raise if setup is missing
+ if engine.setup:
+     loop.run_until_complete(engine.setup.cleanup_async_tasks(timeout=3.0))
297
+
298
+ engine = GraphQueryEngine()
299
+ print("Reloaded!")
300
+ continue
301
+
302
+ if user_query.strip() == "":
303
+ continue
304
+
305
+ # Direct method call - clean forward-only implementation
306
+ result = engine.run_query(user_query)
307
+
308
+ # Print results
309
+ print("\n" + "=" * 60)
310
+ print("GraphRAG Query Results")
311
+ print("=" * 60)
312
+ print(f"Answer: {result['answer']}")
313
+ print(f"\nMetadata: {json.dumps(result['metadata'], indent=2)}")
314
+ print("=" * 60)
315
+
316
+ except Exception as e:
317
+ logger.error(f"Error processing query: {e}")
318
+ print(f"Error processing query: {e}")
319
+ finally:
320
+ # Run final cleanup before exiting using setup module
321
+ if 'loop' in locals() and 'engine' in locals():
322
+ # Guard: only schedule cleanup when a setup object exists
+ if engine.setup:
+     loop.run_until_complete(engine.setup.cleanup_async_tasks(timeout=5.0))
325
+ loop.close()
326
+ if 'engine' in locals():
327
+ engine.close()
CHANGELOG.md ADDED
@@ -0,0 +1,208 @@
1
+ # AllyCat Changelog
2
+
3
+ All notable technical changes to AllyCat GraphRAG will be documented in this file.
4
+
5
+ ## [Unreleased]
6
+
7
+ ### Added
8
+
9
+ #### GraphRAG Implementation
10
+ - **GraphRAG Core System**: Implemented Microsoft GraphRAG-inspired architecture
11
+ - Entity extraction and relationship mapping from documents
12
+ - Community detection algorithms for knowledge graph clustering
13
+ - Multi-phase graph processing pipeline (phase 1: entities/relationships, phase 2: communities, phase 3: summaries)
14
+ - Graph-based query system with hierarchical summarization
15
+ - Neo4j integration for graph database storage (`3b_save_to_graph_db.py`)
16
+ - Graph query functions in `query_graph_functions/` directory
17
+ - Dual RAG modes: Traditional Vector RAG + Advanced GraphRAG
18
+ - **Note**: More improvements planned based on [Microsoft GraphRAG Project](https://github.com/microsoft/graphrag)
19
+
20
+ #### LLM Provider Support
21
+ - **Cerebras API Integration**: Added support for Cerebras ultra-fast inference
22
+ - **Google Gemini API Integration**: Added support for Google's Gemini models
23
+ - **LiteLLM Framework**: Implemented `litellm_patch.py` for unified LLM API interface
24
+ - Supports multiple providers: OpenAI, Replicate, Nebius, Cerebras, Gemini, Anthropic, and more
25
+ - Simplified provider switching via environment variables
26
+
27
+ #### Database Solutions
28
+ - **Zilliz Cloud Integration**: Added cloud-based vector database support
29
+ - Implemented `3_save_to_vector_db_zilliz.py` for Zilliz Cloud
30
+ - Cloud vector database eliminates need for local Milvus server
31
+ - Configurable via `VECTOR_DB_TYPE` environment variable
32
+ - **Neo4j Graph Database**: Integrated for GraphRAG knowledge graph storage
33
+ - Stores entities, relationships, and community structures
34
+ - Enables complex graph traversal queries
35
+
36
+ #### Docker Deployment System
37
+ - **Three Deployment Modes**: Flexible deployment configurations
38
+ - **Cloud Mode** (`docker-compose.cloud.yml`): Cloud LLM + Cloud Zilliz vector DB
39
+ - **Hybrid Mode** (`docker-compose.hybrid.yml`): Cloud LLM + Local Milvus
40
+ - **Local Mode** (`docker-compose.local.yml`): Local Ollama + Local Milvus
41
+ - **Automated Deployment Script**: `docker-startup.sh` orchestrates full deployment
42
+ - Conditional service startup based on deployment mode
43
+ - Automatic Ollama model download for local mode
44
+ - Smart service detection and initialization
45
+
46
+ #### Automatic Pipeline Execution
47
+ - **End-to-End Automation**: Single command deployment from crawling to running application
48
+ - Automatic website crawling when `WEBSITE_URL` is set
49
+ - Sequential pipeline execution: crawl → process → vector DB → graph processing → graph DB
50
+ - Automatic application startup after pipeline completion
51
+ - Controlled via `AUTO_RUN_PIPELINE` environment variable
52
+ - **User Action Required**: Only set environment variables in `.env` file
53
+
54
+ #### Document Processing Improvements
55
+ - **HTML/HTM Processing**: Switched to `html2text` library for better HTML parsing
56
+ - Improved markdown conversion quality
57
+ - Better handling of HTML structure and formatting
58
+ - **Resolves**: [Issue #50](https://github.com/The-AI-Alliance/allycat/issues/50)
59
+ - **PDF Processing**: Integrated `docling` library for advanced PDF parsing
60
+ - High-quality PDF to markdown conversion
61
+ - Preserves document structure and formatting
62
+ - Handles complex PDF layouts
63
+
64
+ #### Port Management System
65
+ - **Multiple Dynamic Ports**: Flexible port configuration for different services
66
+ - `FLASK_GRAPH_PORT=8080` - Flask GraphRAG application
67
+ - `FLASK_VECTOR_PORT=8081` - Flask Vector RAG application
68
+ - `CHAINLIT_GRAPH_PORT=8083` - Chainlit GraphRAG application
69
+ - `CHAINLIT_VECTOR_PORT=8082` - Chainlit Vector RAG application
70
+ - `DOCKER_PORT` - Host machine port mapping
71
+ - `DOCKER_APP_PORT=8080` - Internal container port
72
+ - `OLLAMA_PORT=11434` - Ollama server port (local mode)
73
+ - **Smart Port Routing**: Automatic port selection based on `APP_TYPE` environment variable
74
+ - Supports: `flask_graph`, `flask`, `chainlit_graph`, `chainlit`
75
+
76
+ #### Memory Optimization System
77
+ - **CLEANUP_PIPELINE_DEPS Feature**: Post-pipeline dependency cleanup
78
+ - Created `requirements-runtime.txt` (~300 MB) - minimal packages for running Flask GraphRAG app
79
+ - Created `requirements-build.txt` (~500 MB) - pipeline-only packages that can be removed
80
+ - Created `cleanup_pipeline_deps.sh` - automated cleanup script
81
+ - Integrated cleanup into `docker-startup.sh` with conditional execution
82
+ - Added `CLEANUP_PIPELINE_DEPS` configuration to all `.env` sample files
83
+ - Created comprehensive technical documentation in `docs/docker-memory-optimization.md`
84
+ - Updated `docs/docker-deployment-guide.md` with memory optimization section
85
+ - **Benefits**: Reduces container RAM from ~800 MB to ~300 MB, enabling 1GB deployments
86
+ - **Cost Savings**: DigitalOcean 1GB ($12/mo) vs 2GB ($25/mo) = $156/year savings (52% reduction)
87
+
88
+ ### Changed
89
+ - **Chainlit Port Configuration**: Reverted Chainlit apps to use default port behavior
90
+ - Removed custom port configuration code from Python files
91
+ - Chainlit now uses default port 8000 for native Python execution
92
+ - Docker deployments use custom ports via `--port` flag in `docker-startup.sh`
93
+ - Updated documentation in `docs/graphrag-demo/Setup.md`, `docs/configuration.md`, and `my_config.py`
94
+ - Native Python: `chainlit run app_chainlit_graph.py` (port 8000) or with custom `--port 8083`
95
+ - Docker: Custom ports 8082 (vector) and 8083 (graph) configured via environment variables
96
+
97
+
98
+ ### Fixed
99
+ - **HTML/HTM File Processing**: Fixed HTML parsing issues ([Issue #50](https://github.com/The-AI-Alliance/allycat/issues/50))
100
+ - Switched from previous parser to `html2text` library
101
+ - Improved markdown conversion quality and reliability
102
+ - **Pipeline Error Handling**: Confirmed robust error handling with `|| echo "Warning..."` pattern
103
+ - Cleanup script runs even if pipeline steps fail
104
+ - Application starts successfully regardless of pipeline completion status
105
+
106
+ ## [Previous Versions]
107
+
108
+ ### 2025-07-14: Major Update
109
+
110
+ #### Added
111
+ - **Robust Web Crawler** ([#31](https://github.com/The-AI-Alliance/allycat/issues/31))
112
+ - Complete rewrite of web crawler
113
+ - More robust handling of edge cases
114
+ - Support for multiple file types (not just text/html)
115
+ - Correct handling of anchor tags (`a.html#news`) in HTML files
116
+ - Customizable pause between requests to avoid hammering webservers
117
+ - Fixed issue with repeatedly downloading same content
118
+
119
+ - **LiteLLM Integration** ([#34](https://github.com/The-AI-Alliance/allycat/issues/34))
120
+ - Unified LLM backend support replacing Replicate and Ollama setup
121
+ - Seamless access to local LLMs (using Ollama) and cloud inference providers
122
+ - Support for providers: Nebius, Replicate, and more
123
+ - Significantly simplified LLM configuration
124
+ - Added `python-dotenv` for environment variable management
125
+
126
+ - **Expanded File Type Support** ([#37](https://github.com/The-AI-Alliance/allycat/issues/37))
127
+ - Support for PDF, DOCX, and other popular file types (previously only HTML)
128
+ - Integration with [Docling](https://github.com/docling-project/docling) for file processing
129
+ - Fixed issue with PDF downloads ([#35](https://github.com/The-AI-Alliance/allycat/issues/35))
130
+ - Processing all downloaded file types
131
+ - Updated process_file script
132
+
133
+ - **UV Package Manager Support** ([#26](https://github.com/The-AI-Alliance/allycat/issues/26))
134
+ - Added [uv](https://docs.astral.sh/uv/) project structure
135
+ - Updated documentation for uv
136
+ - Continued support for `requirements.txt` and other package managers
137
+
138
+ - **Better Config Management** ([#19](https://github.com/The-AI-Alliance/allycat/issues/19))
139
+ - User configuration via `.env` file
140
+ - Simplified config management
141
+ - Easier experimentation without code changes
142
+ - Documented configuration options
143
+ - Updated env.sample file with settings
144
+
145
+ - **Metrics Collection**: Added metrics collection scripts and issue templates
146
+
147
+ #### Changed
148
+ - **Chainlit App Updates** ([#38](https://github.com/The-AI-Alliance/allycat/issues/38))
149
+ - Updated Chainlit application
150
+ - Customizable starter prompts
151
+ - **Logo Updates** ([#39](https://github.com/The-AI-Alliance/allycat/issues/39))
152
+ - Updated logo to AllyCAT
153
+ - **App Naming**: Changed Flask and Chainlit app names for clarity
154
+ - Code cleanup improvements
155
+ - Documentation updates across the project
156
+
157
+ ### 2025-05: Chainlit Integration
158
+
159
+ #### Added
160
+ - **Chainlit Chat Interface** ([#17](https://github.com/The-AI-Alliance/allycat/issues/17))
161
+ - Introduced Chainlit-based chat interface as alternative to Flask UI
162
+ - Improved chat UI experience
163
+
164
+ #### Changed
165
+ - Updated README with license and issues links
166
+
167
+ ### 2025-04: Dockerization and Local LLM Support
168
+
169
+ #### Added
170
+ - **Docker Support**: Complete dockerization of the application
171
+ - Docker deployment configurations
172
+ - Updated Google Cloud deployment guide
173
+ - Comprehensive Docker documentation (`running-in-docker.md`)
174
+ - **Ollama Integration**: Local LLM support with Ollama
175
+ - Local LLM configuration and setup
176
+ - Local Jupyter Lab support
177
+ - Small tweaks to local LLM config
178
+ - **Python Scripts**: Added Python script versions of notebooks
179
+
180
+ #### Changed
181
+ - Updated deployment documentation
182
+ - Native running documentation (`running-natively.md`)
183
+
184
+ ### 2025-03: Database and LLM Updates
185
+
186
+ #### Added
187
+ - **Weaviate Database**: Added Weaviate vector database support
188
+ - **Local LLM Support**: Initial local LLM integration
189
+
190
+ #### Changed
191
+ - **LLM Switch**: Changed from initial LLM to Llama
192
+ - Added logo and GitHub link to UI
193
+ - README and deploy guide updates
194
+
195
+ ### 2025-02: Initial Release - AllyCAT (formerly AllyChat)
196
+
197
+ #### Added
198
+ - **Initial Vector RAG System**: First version of AllyCAT
199
+ - Basic RAG implementation
200
+ - Vector database for document storage and retrieval
201
+ - Query system for document Q&A
202
+ - **Flask Web Interface**: Web-based chat interface
203
+ - **Basic Crawling**: Initial website crawling functionality
204
+ - **Document Processing**: Basic document processing pipeline
205
+
206
+
207
+
208
+ ---
Dockerfile ADDED
@@ -0,0 +1,66 @@
1
+ FROM python:3.11-slim
2
+
3
+ # Build arguments for conditional installation
4
+ ARG INSTALL_OLLAMA=false
5
+ ARG INSTALL_LOCAL_VECTOR_DB=false
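+ # Example builds (illustrative; the image tag is arbitrary):
+ #   cloud mode (default):        docker build -t allycat .
+ #   local LLM + local vector DB: docker build --build-arg INSTALL_OLLAMA=true --build-arg INSTALL_LOCAL_VECTOR_DB=true -t allycat .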
6
+
7
+ # Set working directory
8
+ WORKDIR /allycat
9
+
10
+ # Set environment variables - Cloud-first defaults
11
+ ENV PYTHONDONTWRITEBYTECODE=1 \
12
+ PYTHONUNBUFFERED=1 \
13
+ LLM_RUN_ENV=cloud \
14
+ VECTOR_DB_TYPE=cloud_zilliz
15
+
16
+
17
+ # Install minimal system dependencies
18
+ RUN apt-get update && apt-get install -y --no-install-recommends \
19
+ bash \
20
+ curl \
21
+ git \
22
+ netcat-traditional \
23
+ && apt-get clean \
24
+ && rm -rf /var/lib/apt/lists/*
25
+
26
+ # Copy requirements file - Use cloud-optimized by default
27
+ COPY requirements-docker-cloud.txt .
28
+
29
+ # Install Python dependencies
30
+ RUN pip install --no-cache-dir -r requirements-docker-cloud.txt
31
+
32
+ # Conditional: Install Ollama only if requested
33
+ RUN if [ "$INSTALL_OLLAMA" = "true" ]; then \
34
+ echo "Installing Ollama for local LLM support..."; \
35
+ curl -fsSL https://ollama.com/install.sh | sh; \
36
+ else \
37
+ echo "Skipping Ollama installation - using cloud LLM mode"; \
38
+ fi
39
+
40
+ # Conditional: Install local vector DB dependencies
41
+ RUN if [ "$INSTALL_LOCAL_VECTOR_DB" = "true" ]; then \
42
+ echo "Installing milvus-lite for local vector database..."; \
43
+ pip install --no-cache-dir milvus-lite==2.4.11; \
44
+ else \
45
+ echo "Skipping local vector DB - using Zilliz Cloud"; \
46
+ fi
47
+
48
+ # Copy project files
49
+ COPY . .
50
+ RUN chmod +x ./docker-startup.sh
51
+
52
+ # Cleanup unnecessary files
53
+ RUN rm -rf .env workspace/* __pycache__ *.pyc
54
+
55
+ # Expose all application ports (EXPOSE doesn't support env variables at build time)
56
+ # Port 8080 = FLASK_GRAPH_PORT (default) / DOCKER_APP_PORT (default)
57
+ # Port 8081 = FLASK_VECTOR_PORT (default)
58
+ # Port 8082 = CHAINLIT_VECTOR_PORT (default)
59
+ # Port 8083 = CHAINLIT_GRAPH_PORT (default)
60
+ # Port mapping controlled by docker-compose.yml: ${DOCKER_PORT}:${DOCKER_APP_PORT}
61
+ EXPOSE 8080 8081 8082 8083
62
+ # Port 11434 = OLLAMA_PORT (default) - only used if INSTALL_OLLAMA=true
63
+ EXPOSE 11434
64
+
65
+ ENTRYPOINT ["./docker-startup.sh"]
66
+ CMD ["deploy"]
Dockerfile-dev ADDED
@@ -0,0 +1,65 @@
1
+ FROM python:3.11-slim
2
+
3
+ # Build arguments for conditional installation
4
+ ARG INSTALL_OLLAMA=false
5
+ ARG INSTALL_LOCAL_VECTOR_DB=false
6
+
7
+ # Set working directory
8
+ WORKDIR /allycat
9
+
10
+ # Set environment variables - Cloud-first defaults
11
+ ENV PYTHONDONTWRITEBYTECODE=1 \
12
+ PYTHONUNBUFFERED=1 \
13
+ LLM_RUN_ENV=cloud \
14
+ VECTOR_DB_TYPE=cloud_zilliz
15
+
16
+
17
+ # Install minimal system dependencies
18
+ RUN apt-get update && apt-get install -y --no-install-recommends \
19
+ bash \
20
+ curl \
21
+ git \
22
+ netcat-traditional \
23
+ && apt-get clean \
24
+ && rm -rf /var/lib/apt/lists/*
25
+
26
+ # Copy requirements file - Use cloud-optimized by default
27
+ COPY requirements-docker-cloud.txt .
28
+
29
+ # Install Python dependencies
30
+ RUN pip install --no-cache-dir -r requirements-docker-cloud.txt
31
+
32
+ # Conditional: Install Ollama only if requested
33
+ RUN if [ "$INSTALL_OLLAMA" = "true" ]; then \
34
+ echo "Installing Ollama for local LLM support..."; \
35
+ curl -fsSL https://ollama.com/install.sh | sh; \
36
+ else \
37
+ echo "Skipping Ollama installation - using cloud LLM mode"; \
38
+ fi
39
+
40
+ # Conditional: Install local vector DB dependencies
41
+ RUN if [ "$INSTALL_LOCAL_VECTOR_DB" = "true" ]; then \
42
+ echo "Installing milvus-lite for local vector database..."; \
43
+ pip install --no-cache-dir milvus-lite==2.4.11; \
44
+ else \
45
+ echo "Skipping local vector DB - using Zilliz Cloud"; \
46
+ fi
47
+
48
+ # Copy project files
49
+ COPY . .
50
+ RUN chmod +x ./docker-startup.sh
51
+
52
+ # Cleanup unnecessary files
53
+ RUN rm -rf .env workspace/* __pycache__ *.pyc
54
+
55
+ # Expose all application ports (EXPOSE doesn't support env variables at build time)
56
+ # Port 8080 = FLASK_GRAPH_PORT (default) / DOCKER_APP_PORT (default)
57
+ # Port 8081 = FLASK_VECTOR_PORT (default)
58
+ # Port 8082 = CHAINLIT_VECTOR_PORT (default)
59
+ # Port 8083 = CHAINLIT_GRAPH_PORT (default)
60
+ # Port mapping controlled by docker-compose.yml: ${DOCKER_PORT}:${DOCKER_APP_PORT}
61
+ EXPOSE 8080 8081 8082 8083
62
+ # Port 11434 = OLLAMA_PORT (default) - only used if INSTALL_OLLAMA=true
63
+ EXPOSE 11434
64
+
65
+ ENTRYPOINT ["./docker-startup.sh"]
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,10 +1,75 @@
1
- ---
2
- title: Allycat
3
- emoji: 🐨
4
- colorFrom: green
5
- colorTo: gray
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ <img src="assets/allycat.png" alt="Alley Cat" width="200"/>
2
+
3
+ [![License](https://img.shields.io/github/license/The-AI-Alliance/allycat)](https://github.com/The-AI-Alliance/allycat/blob/main/LICENSE)
4
+ [![Issues](https://img.shields.io/github/issues/The-AI-Alliance/allycat)](https://github.com/The-AI-Alliance/allycat/issues)
5
+ ![GitHub stars](https://img.shields.io/github/stars/The-AI-Alliance/allycat?style=social)
6
+
7
+ # AllyCat
8
+
9
+ **AllyCat** is a full stack, open source chatbot that uses GenAI LLMs to answer questions about your website with both traditional Vector RAG and advanced GraphRAG capabilities. It is simple by design and will run on your laptop or server.
10
+
11
+ ## Why?
12
+
13
+ AllyCat is purposefully simple so it can be used by developers to learn how RAG-based GenAI works. Yet it is powerful enough to use with your website. You may also extend it for your own purposes.
14
+
15
+ ⭐ **Found this tool helpful? Give it a star on GitHub to support the project and help others discover it!**
16
+
17
+ **🗞️ [Allycat news](news.md)** - releases and new features!
18
+
19
+ ## How does it work?
20
+ AllyCat uses your choice of LLM and vector database to implement a chatbot written in Python using [RAG](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) architecture.
21
+ AllyCat also includes web scraping tools that extract data from your website (or any website).
22
+
23
+ ## 🌟🌟 Features 🌟🌟
24
+
25
+ 1. **Dual RAG Modes:** Traditional Vector RAG and Advanced GraphRAG with entity extraction and community detection
26
+ 2. **Web Crawling & Scraping:** Text extraction, data/HTML processing, conversion to markdown.
27
+ - **Currently uses:** [Data Prep Kit Connector](https://github.com/data-prep-kit/data-prep-kit/blob/dev/data-connector-lib/doc/overview.md) and [Docling](https://github.com/docling-project/docling)
28
+ 3. **Processing:** Chunking, vector embedding creation, saving to vector database.
29
+ - **Currently uses:** [Llama Index](https://docs.llamaindex.ai/en/stable/) and [Granite Embedding Model](https://huggingface.co/ibm-granite/granite-embedding-30m-english)
30
+ 4. **Multiple LLM Support:**
31
+ - **Local:** [Ollama](https://ollama.com/) with [Llama](https://www.llama.com) or [Granite](https://huggingface.co/collections/ibm-granite/granite-33-language-models-67f65d0cca24bcbd1d3a08e3)
32
+ - **Cloud:** OpenAI, Cerebras, Google Gemini, Replicate, Nebius, and more via [LiteLLM](https://docs.litellm.ai/docs)
33
+ 5. **Multiple Database Support:**
34
+ - **Vector:** [Milvus](https://milvus.io/) (local/embedded) or [Zilliz Cloud](https://zilliz.com/)
35
+ - **Graph:** [Neo4j](https://neo4j.com/) for GraphRAG knowledge graphs
36
+ 6. **Flexible Deployment:** Docker support with 3 modes (Cloud, Hybrid, Local) + Native Python
37
+ 7. **Chatbot Interfaces:** Flask web UI and Chainlit chat interface
38
+
39
+ ## ⚡️⚡️Quickstart ⚡️⚡️
40
+
41
+ There are two ways to run AllyCat.
42
+
43
+ ### Option 1: Use the Docker image
44
+
45
+ A great option for a quick evaluation.
46
+ See [running AllyCat using docker](docs/running-in-docker.md)
47
+
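+ As a rough sketch (assuming Docker Compose is installed and using the compose files shipped with the repo; the commands here are illustrative, not the canonical steps):
+
+ ```bash
+ # copy one of the provided .env sample files to .env, then set WEBSITE_URL plus your API keys / database credentials
+ # pick the compose file for your mode: docker-compose.cloud.yml, docker-compose.hybrid.yml, or docker-compose.local.yml
+ docker compose -f docker-compose.cloud.yml up --build
+ ```
+
+ The full, authoritative walkthrough is in the Docker guide linked above.
+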
48
+ ### Option 2: Run natively (for tweaking, developing)
49
+
50
+ Choose this option if you want to tweak AllyCat to fit your needs. For example, experimenting with embedding models or LLMs.
51
+ See [running AllyCat natively](docs/running-natively.md)
52
+
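+ A minimal native sketch (assuming Python 3.11, a populated `.env`, and your LLM provider and databases configured per the guide; uv works as an alternative to pip):
+
+ ```bash
+ pip install -r requirements.txt
+
+ # launch one of the chat UIs, e.g. the GraphRAG Chainlit app (Chainlit's default port is 8000)
+ chainlit run app_chainlit_graph.py
+ ```
+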
53
+ ## AllyCat Workflow
54
+
55
+ ![](assets/rag-website-1.png)
56
+
57
+ See [running allycat](docs/running-allycat.md)
58
+
59
+ ## Customizing AllyCat
60
+
61
+ See [customizing allycat](docs/customizing-allycat.md)
62
+
63
+ ## Deploying AllyCat
64
+
65
+ See [deployment guide](docs/deploy.md)
66
+
67
+ ## Developing AllyCat
68
+
69
+ See [developing allycat](docs/developing-allycat.md)
70
+
71
+ ## Why the name **AllyCat**?
72
+
73
+ Originally named AllianceChat, the project became AllyCat when we learned that "chat" means cat in French. Who doesn't love cats?!
74
+
75
+
app_chainlit.py ADDED
@@ -0,0 +1,299 @@
1
+ import chainlit as cl
2
+ import os
3
+ import logging
4
+ from dotenv import load_dotenv
5
+ import time
6
+ import asyncio
7
+ import re
9
+
10
+ # Import llama-index and related libraries
11
+ from llama_index.core import VectorStoreIndex, StorageContext
12
+ from llama_index.vector_stores.milvus import MilvusVectorStore
13
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
14
+ from llama_index.core import Settings
15
+ from llama_index.llms.litellm import LiteLLM
16
+ from my_config import MY_CONFIG
17
+ import query_utils
18
+
19
+ # Global variables for LLM and index
20
+ vector_index = None
21
+ initialization_complete = False
22
+
23
+ # Create logs directory if it doesn't exist
24
+ os.makedirs('logs/chainlit', exist_ok=True)
25
+
26
+ logging.basicConfig(level=logging.WARNING,
27
+ format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s',
28
+ handlers=[
29
+ logging.FileHandler('logs/chainlit/chainlit_vector.log', mode='a'),
30
+ logging.StreamHandler()
31
+ ],
32
+ force=True)
33
+ logger = logging.getLogger(__name__)
34
+ logger.setLevel(logging.INFO)
35
+
36
+
37
+ os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
38
+ # Load environment variables from .env file
39
+ load_dotenv()
40
+
41
+ def initialize():
42
+ """
43
+ Initialize LLM and Milvus vector database using llama-index.
44
+ This function sets up the necessary components for the chat application.
45
+ """
46
+ global vector_index, initialization_complete
47
+
48
+ if initialization_complete:
49
+ return
50
+
51
+ logger.info("Initializing LLM and vector database...")
52
+
53
+ # raise Exception ("init exception test") # debug
54
+
55
+ try:
56
+ ## embedding model
57
+ Settings.embed_model = HuggingFaceEmbedding(
58
+ model_name = MY_CONFIG.EMBEDDING_MODEL
59
+ )
60
+ logger.info(f"✅ Using embedding model: {MY_CONFIG.EMBEDDING_MODEL}")
61
+
62
+ # Setup LLM
63
+ logger.info(f"✅ Using LLM model : {MY_CONFIG.LLM_MODEL}")
64
+ Settings.llm = LiteLLM(
65
+ model=MY_CONFIG.LLM_MODEL,
66
+ )
67
+
68
+ # Initialize Milvus vector store
69
+ vector_store = MilvusVectorStore(
70
+ uri = MY_CONFIG.MILVUS_URI_VECTOR,
71
+ dim = MY_CONFIG.EMBEDDING_LENGTH ,
72
+ collection_name = MY_CONFIG.COLLECTION_NAME,
73
+ overwrite=False # so we load the index from db
74
+ )
75
+ storage_context = StorageContext.from_defaults(vector_store=vector_store)
76
+ logger.info (f"✅ Connected to Milvus instance: {MY_CONFIG.MILVUS_URI_VECTOR}")
77
+
78
+ vector_index = VectorStoreIndex.from_vector_store(
79
+ vector_store=vector_store, storage_context=storage_context)
80
+ logger.info (f"✅ Loaded index from vector db: {MY_CONFIG.MILVUS_URI_VECTOR}")
81
+
82
+ logger.info("Successfully initialized LLM and vector database")
83
+
84
+ initialization_complete = True
85
+ except Exception as e:
86
+ initialization_complete = False
87
+ logger.error(f"Error initializing LLM and vector database: {str(e)}")
88
+ raise (e)
89
+ # return False
90
+ ## -------------
91
+
92
+ def extract_thinking_section(response_text):
93
+ """
94
+ Extract thinking section from LLM response if present.
95
+
96
+ Args:
97
+ response_text (str): The full response from the LLM
98
+
99
+ Returns:
100
+ tuple: (thinking_content, cleaned_response)
101
+ - thinking_content: Content within <think></think> tags or None if not found
102
+ - cleaned_response: Response with thinking section removed
103
+ """
104
+ thinking_pattern = r'<think>(.*?)</think>'
105
+ match = re.search(thinking_pattern, response_text, re.DOTALL)
106
+
107
+ if match:
108
+ thinking_content = match.group(1).strip()
109
+ cleaned_response = re.sub(thinking_pattern, '', response_text, flags=re.DOTALL).strip()
110
+ return thinking_content, cleaned_response
111
+ else:
112
+ return None, response_text
113
+
114
+ async def get_llm_response(message):
115
+ """
116
+ Process the user message and get a response from the LLM using Vector RAG
117
+ with structured prompt
118
+ """
119
+ global vector_index, initialization_complete
120
+
121
+ # Check if LLM and index are initialized
122
+ if vector_index is None or not initialization_complete:
123
+ return "System did not initialize. Please try again later.", 0
124
+
125
+ start_time = time.time()
126
+ response_text = ''
127
+
128
+ try:
129
+ # Step 1: Query preprocessing
130
+ async with cl.Step(name="Query Preprocessing", type="tool") as step:
131
+ logger.info("Start query preprocessing step...")
132
+ step.input = message
133
+
134
+ # Create a query engine from the index
135
+ query_engine = vector_index.as_query_engine()
136
+
137
+ # Preprocess the query
138
+ original_message = message
139
+ message = query_utils.tweak_query(message, MY_CONFIG.LLM_MODEL)
140
+
141
+ step.output = f"Optimized query: {message}"
142
+ ## --- end: Step 1 ---
143
+
144
+ # Query the index with structured prompting
145
+ logger.info("Calling LLM with structured prompting...")
146
+ t1 = time.time()
147
+
148
+ # Get initial vector response
149
+ vector_response = query_engine.query(message)
150
+ vector_text = str(vector_response).strip()
151
+
152
+ # Structured prompt
153
+ structured_prompt = f"""Please provide a comprehensive, well-structured answer using the provided document information.
154
+
155
+ Question: {message}
156
+
157
+ Document Information:
158
+ {vector_text}
159
+
160
+ Instructions:
161
+ 1. Provide accurate, factual information based on the documents
162
+ 2. Structure your response clearly with proper formatting
163
+ 3. Be comprehensive yet concise
164
+ 4. Highlight key relationships and important details when relevant
165
+ 5. Use bullet points or sections when appropriate for clarity
166
+
167
+ Please provide your answer:"""
168
+
169
+ # Use structured prompt for final synthesis
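+ # (This issues a second retrieval pass: the query engine searches again using the structured
+ #  prompt text itself as the query, then synthesizes the final answer from those results.)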
170
+ response = query_engine.query(structured_prompt)
171
+
172
+ t2 = time.time()
173
+ if response:
174
+ response_text = str(response).strip()
175
+ else:
176
+ response_text = "No response from LLM."
177
+ logger.info(f"LLM response received in {(t2 - t1):.2f} seconds:\n{response_text[:200]}")
178
+
179
+ # Step 2: Vector search and retrieval
180
+ async with cl.Step(name="Document Retrieval", type="retrieval") as step:
181
+ step.input = message
182
+
183
+ # Show retrieved documents
184
+ if hasattr(response, 'source_nodes') and response.source_nodes:
185
+ sources_output = []
186
+ for i, node in enumerate(response.source_nodes[:3]): # Show top 3 sources
187
+ score = node.score if hasattr(node, 'score') else 'N/A'
188
+ text_preview = node.text[:200] + "..." if len(node.text) > 200 else node.text
189
+ sources_output.append(f"Source {i+1} (Score: {score}): {text_preview}")
190
+ step.output = "\n\n".join(sources_output)
191
+ else:
192
+ step.output = "No relevant documents found."
193
+
194
+
195
+ # Extract thinking section if present
196
+ thinking_content, cleaned_response = extract_thinking_section(response_text)
197
+ # print (f"------ Thinking Content:-----\n{thinking_content}\n------") # Debug print
198
+ # print (f"------ Cleaned Response:-----\n{cleaned_response}\n------") # Debug print
199
+
200
+ # Step 3: Optional Thinking Process
201
+ if thinking_content:
202
+ async with cl.Step(name="💭 Thinking Process", type="run") as step:
203
+ step.input = ""
204
+ step.output = thinking_content
205
+ logger.info(f"Thinking:\n{thinking_content[:200]}...")
206
+
207
+ # Step 4: LLM Answer
208
+ async with cl.Step(name="Response", type="llm") as step:
209
+ step.input = f"Query: {message}\nContext: Retrieved from vector database"
210
+
211
+ if cleaned_response:
212
+ step.output = cleaned_response
213
+ logger.info(f"Response:\n{cleaned_response[:200]}...")
214
+ else:
215
+ step.output = "No response from LLM."
216
+ logger.info(f"Response:\nNo response from LLM.")
217
+
218
+ except Exception as e:
219
+ logger.error(f"Error getting LLM response: {str(e)}")
220
+ response_text = f"Sorry, I encountered an error while processing your request:\n{str(e)}"
221
+
222
+ end_time = time.time()
223
+ elapsed_time = end_time - start_time
224
+
225
+ return response_text, elapsed_time
226
+
227
+ ## --- end: def get_llm_response():
228
+
229
+ # ====== CHAINLIT SPECIFIC CODE ======
230
+
231
+ @cl.set_starters
232
+ async def set_starters():
233
+ starters = []
234
+ for prompt in MY_CONFIG.STARTER_PROMPTS:
235
+ starters.append(
236
+ cl.Starter(
237
+ label=prompt.strip(),
238
+ message=prompt.strip(),
239
+ )
240
+ )
241
+ return starters
242
+ ## --- end: def set_starters(): ---
243
+
244
+ @cl.on_chat_start
245
+ async def start():
246
+ """Initialize the chat session"""
247
+ # Store initialization state in user session
248
+ cl.user_session.set("chat_started", True)
249
+ logger.info("User chat session started")
250
+ init_error = None
251
+
252
+ try:
253
+ initialize()
254
+ # await cl.Message(content="How can I assist you today?").send()
255
+ except Exception as e:
256
+ init_error = str(e)
257
+ error_msg = f"""System Initialization Error
258
+
259
+ The system failed to initialize with the following error:
260
+
261
+ ```
262
+ {init_error}
263
+ ```
264
+
265
+ Please check your configuration and environment variables."""
266
+ await cl.Message(content=error_msg).send()
267
+
268
+ @cl.on_message
269
+ async def main(message: cl.Message):
270
+ """Handle incoming messages"""
271
+ user_message = message.content
272
+
273
+ # Get response from LLM with RAG steps shown FIRST
274
+ response_text, elapsed_time = await get_llm_response(user_message)
275
+ # logger.info(f"LLM Response:\n{response_text[:200]}...") # Log first 200 chars
276
+
277
+ thinking_content, cleaned_response = extract_thinking_section(response_text)
278
+
279
+ # Add timing stat to response
280
+ full_response = cleaned_response + f"\n\n⏱️ *Total time: {elapsed_time:.1f} seconds*"
281
+
282
+ # THEN create a new message for streaming
283
+ msg = cl.Message(content="")
284
+ await msg.send()
285
+
286
+ # Stream the response character by character for better UX
287
+ # This simulates streaming - in a real implementation you'd stream from the LLM
288
+ for i in range(0, len(full_response), 5): # Stream in chunks of 5 characters
289
+ await msg.stream_token(full_response[i:i+5])
290
+ await asyncio.sleep(0.01) # Small delay for visual effect
291
+
292
+ # Update the final message
293
+ msg.content = full_response
294
+ await msg.update()
295
+
296
+ ## -------
297
+ if __name__ == '__main__':
298
+ logger.info("App starting up...")
299
+ print(f"{'='*60}\n")
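The streaming loop above slices an already-complete answer into 5-character chunks rather than streaming tokens from the LLM. A standalone sketch of that chunking pattern, with no Chainlit dependency (the sample text is illustrative only):

```python
import time

def iter_chunks(text: str, size: int = 5):
    """Yield fixed-size slices of a completed response, front to back."""
    for start in range(0, len(text), size):
        yield text[start:start + size]

answer = "The index was loaded from the Milvus vector store."
for chunk in iter_chunks(answer):
    print(chunk, end="", flush=True)   # mimic token-by-token display
    time.sleep(0.01)                   # small delay, as in the app
print()
```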
app_chainlit_graph.py ADDED
@@ -0,0 +1,375 @@
1
+ """
2
+ GraphRAG Chainlit Application
3
+ """
4
+
5
+ import chainlit as cl
6
+ import os
7
+ import logging
8
+ from dotenv import load_dotenv
9
+ import time
10
+ import asyncio
11
+ import re
12
+ from typing import Dict, Any, Tuple
13
+
14
+ # Apply nest_asyncio to allow nested event loops
15
+ import nest_asyncio
16
+ nest_asyncio.apply()
17
+
18
+ # Import Step 1 functionality from setup module
19
+ from query_graph_functions.setup import create_graphrag_setup
20
+ # Import Steps 3-5 functionality from query preprocessing module
21
+ from query_graph_functions.query_preprocessing import create_query_preprocessor, preprocess_query_pipeline
22
+ # Import Steps 6-8 functionality from knowledge retrieval module
23
+ from query_graph_functions.knowledge_retrieval import CommunitySearchEngine
24
+ # Import Steps 9-12 functionality from follow-up search module
25
+ from query_graph_functions.follow_up_search import FollowUpSearch
26
+ # Import Steps 13-14 functionality from vector augmentation module
27
+ from query_graph_functions.vector_augmentation import VectorAugmentationEngine
28
+ # Import Steps 15-16 functionality from answer synthesis module
29
+ from query_graph_functions.answer_synthesis import AnswerSynthesisEngine
30
+ # Import Steps 17-20 functionality from response management module
31
+ from query_graph_functions.response_management import ResponseManager
32
+ from my_config import MY_CONFIG
33
+ import query_utils
34
+
35
+ # Configure environment
36
+ os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT
37
+
38
+ # Load environment variables
39
+ load_dotenv()
40
+
41
+ # Create logs directory if it doesn't exist
42
+ os.makedirs('logs/chainlit', exist_ok=True)
43
+
44
+ # Configure logging - Save to file and console
45
+ logging.basicConfig(
46
+ level=logging.INFO,
47
+ format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s',
48
+ handlers=[
49
+ logging.FileHandler('logs/chainlit/chainlit_graph.log', mode='a'),
50
+ logging.StreamHandler()
51
+ ],
52
+ force=True
53
+ )
54
+ logger = logging.getLogger(__name__)
55
+ logger.setLevel(logging.INFO)
56
+
57
+ # Log session start
58
+ logger.info("=" * 80)
59
+ logger.info(f"Chainlit GraphRAG Session Started - {time.strftime('%Y-%m-%d %H:%M:%S')}")
60
+ logger.info("=" * 80)
61
+
62
+ # Global GraphRAG engine instance
63
+ graph_engine = None
64
+ initialization_complete = False
65
+
66
+ def initialize():
67
+ global graph_engine, initialization_complete
68
+
69
+ if initialization_complete:
70
+ return
71
+
72
+ logger.info("Initializing GraphRAG system...")
73
+
74
+ try:
75
+ # Initialize setup module (Step 1)
76
+ setup = create_graphrag_setup()
77
+
78
+ # Create GraphRAG engine wrapper
79
+ class GraphQueryEngine:
80
+ def __init__(self, setup):
81
+ self.setup = setup
82
+ self.neo4j_conn = setup.neo4j_conn
83
+ self.query_engine = setup.query_engine
84
+ self.graph_stats = setup.graph_stats
85
+ self.drift_config = setup.drift_config
86
+ self.llm = setup.llm
87
+ self.config = setup.config
88
+ self.query_preprocessor = None
89
+ self.response_manager = ResponseManager(setup)
90
+
91
+ async def run_query_async(self, user_query):
92
+ """Execute GraphRAG query pipeline (Steps 3-20)"""
93
+ start_time = time.time()
94
+
95
+ optimized_query = query_utils.tweak_query(user_query, MY_CONFIG.LLM_MODEL)
96
+
97
+ try:
98
+ if not self.setup.validate_system_readiness():
99
+ return {"answer": "System not ready", "metadata": {}}
100
+
101
+ # Initialize query preprocessor if needed
102
+ if not self.query_preprocessor:
103
+ self.query_preprocessor = await create_query_preprocessor(
104
+ self.config, self.graph_stats
105
+ )
106
+
107
+ # Phase B: Query Preprocessing (Steps 3-5)
108
+ analysis, routing, vectorization = await preprocess_query_pipeline(
109
+ optimized_query, self.config, self.graph_stats
110
+ )
111
+
112
+ # Phase C: Community Retrieval (Steps 6-7)
113
+ community_engine = CommunitySearchEngine(self.setup)
114
+ community_results = await community_engine.execute_primer_phase(
115
+ vectorization.embedding, routing
116
+ )
117
+
118
+ # Phase D: Follow-up Search (Steps 9-12)
119
+ follow_up_engine = FollowUpSearch(self.setup)
120
+ follow_up_results = await follow_up_engine.execute_follow_up_phase(
121
+ community_results, routing
122
+ )
123
+
124
+ # Phase E: Vector Search Augmentation (Steps 13-14)
125
+ vector_engine = VectorAugmentationEngine(self.setup)
126
+ augmentation_results = await vector_engine.execute_vector_augmentation_phase(
127
+ vectorization.embedding,
128
+ {'communities': community_results['communities'],
129
+ 'initial_answer': community_results['initial_answer'],
130
+ 'follow_up_results': follow_up_results},
131
+ routing
132
+ )
133
+
134
+ # Phase F: Answer Synthesis (Steps 15-16)
135
+ synthesis_engine = AnswerSynthesisEngine(self.setup)
136
+ synthesis_results = await synthesis_engine.execute_answer_synthesis_phase(
137
+ analysis, routing, community_results, follow_up_results, augmentation_results
138
+ )
139
+
140
+ total_time = time.time() - start_time
141
+
142
+ # Generate metadata
143
+ metadata = self.response_manager.generate_comprehensive_metadata(
144
+ analysis=analysis,
145
+ routing=routing,
146
+ vectorization=vectorization,
147
+ community_results=community_results,
148
+ follow_up_results=follow_up_results,
149
+ augmentation_results=augmentation_results,
150
+ synthesis_results=synthesis_results,
151
+ total_time=total_time
152
+ )
153
+
154
+ # Cleanup async tasks
155
+ await self.setup.cleanup_async_tasks(timeout=2.0)
156
+
157
+ return {
158
+ "answer": synthesis_results.final_answer,
159
+ "metadata": metadata,
160
+ "analysis": analysis,
161
+ "routing": routing,
162
+ "community_results": community_results,
163
+ "follow_up_results": follow_up_results,
164
+ "augmentation_results": augmentation_results
165
+ }
166
+
167
+ except Exception as e:
168
+ logger.error(f"Query pipeline error: {e}")
169
+ synthesis_engine = AnswerSynthesisEngine(self.setup)
170
+ return synthesis_engine.generate_error_response(f"Query error: {e}")
171
+
172
+ graph_engine = GraphQueryEngine(setup)
173
+
174
+ logger.info("✅ GraphRAG system initialized")
175
+ logger.info(f"✅ Using LLM: {MY_CONFIG.LLM_MODEL}")
176
+ logger.info(f"✅ Using embedding: {MY_CONFIG.EMBEDDING_MODEL}")
177
+
178
+ initialization_complete = True
179
+
180
+ except Exception as e:
181
+ initialization_complete = False
182
+ logger.error(f"GraphRAG initialization error: {str(e)}")
183
+ raise
184
+
185
+ def extract_thinking_section(response_text):
186
+ """
187
+ Extract thinking section from LLM response if present.
188
+
189
+ Args:
190
+ response_text (str): The full response from the LLM
191
+
192
+ Returns:
193
+ tuple: (thinking_content, cleaned_response)
194
+ - thinking_content: Content within <think></think> tags or None if not found
195
+ - cleaned_response: Response with thinking section removed
196
+ """
197
+ thinking_pattern = r'<think>(.*?)</think>'
198
+ match = re.search(thinking_pattern, response_text, re.DOTALL)
199
+
200
+ if match:
201
+ thinking_content = match.group(1).strip()
202
+ cleaned_response = re.sub(thinking_pattern, '', response_text, flags=re.DOTALL).strip()
203
+ return thinking_content, cleaned_response
204
+ else:
205
+ return None, response_text
206
+
207
+ async def get_llm_response(message):
208
+ """
209
+ Process user message
210
+ """
211
+ global graph_engine, initialization_complete
212
+
213
+ if not initialization_complete or graph_engine is None:
214
+ return "System not initialized. Please try again later.", 0
215
+
216
+ start_time = time.time()
217
+
218
+ try:
219
+ # Step 1: Query Preprocessing
220
+ async with cl.Step(name="Query Analysis", type="tool") as step:
221
+ step.input = message
222
+ optimized_query = query_utils.tweak_query(message, MY_CONFIG.LLM_MODEL)
223
+ step.output = f"Optimized query: {optimized_query}"
224
+
225
+ # Execute GraphRAG query pipeline
226
+ result = await graph_engine.run_query_async(message)
227
+
228
+ # Step 2: Community Search
229
+ if 'community_results' in result:
230
+ async with cl.Step(name="Community Retrieval", type="retrieval") as step:
231
+ communities = result['community_results'].get('communities', [])
232
+ step.input = "Searching graph communities"
233
+ step.output = f"Found {len(communities)} relevant communities"
234
+
235
+ # Step 3: Follow-up Search
236
+ if 'follow_up_results' in result:
237
+ async with cl.Step(name="Entity Search", type="retrieval") as step:
238
+ step.input = "Analyzing entity relationships"
239
+ follow_up = result['follow_up_results']
240
+
241
+ entities_found = len(follow_up.get('detailed_entities', []))
242
+ relationships_found = sum(
243
+ len(search.traversed_relationships)
244
+ for search in follow_up.get('local_search_results', [])
245
+ )
246
+ step.output = f"Entities: {entities_found}, Relationships: {relationships_found}"
247
+
248
+ # Step 4: Vector Augmentation
249
+ if 'augmentation_results' in result:
250
+ async with cl.Step(name="Document Augmentation", type="retrieval") as step:
251
+ step.input = "Enriching with vector search"
252
+ aug_results = result['augmentation_results']
253
+
254
+ if hasattr(aug_results, 'vector_results'):
255
+ chunks = aug_results.vector_results
256
+ step.output = f"Retrieved {len(chunks)} relevant document chunks"
257
+ else:
258
+ step.output = "Vector augmentation completed"
259
+
260
+ # Extract answer and timing
261
+ full_response = result.get('answer', 'No response generated')
262
+
263
+ # Filter out metadata section more robustly
264
+ lines = full_response.split('\n')
265
+ filtered_lines = []
266
+
267
+ for line in lines:
268
+ stripped = line.strip()
269
+ # Skip these metadata lines completely
270
+ if (stripped.startswith('## Comprehensive Answer') or
271
+ stripped.startswith('# Comprehensive Answer') or
272
+ stripped.startswith('---') or
273
+ stripped.startswith('**Answer Confidence**:') or
274
+ stripped.startswith('**Sources Integrated**:') or
275
+ stripped.startswith('**Multi-Phase Coverage**:')):
276
+ continue
277
+
278
+ filtered_lines.append(line)
279
+
280
+ response_text = '\n'.join(filtered_lines).strip()
281
+
282
+ # Extract thinking section if present
283
+ thinking_content, cleaned_response = extract_thinking_section(response_text)
284
+
285
+ # Step 5: Optional Thinking Process
286
+ if thinking_content:
287
+ async with cl.Step(name="Reasoning Process", type="run") as step:
288
+ step.input = ""
289
+ step.output = thinking_content
290
+ logger.info(f"Thinking:\n{thinking_content[:200]}...")
291
+
292
+ # Step 6: Final Answer
293
+ async with cl.Step(name="Synthesis", type="llm") as step:
294
+ step.input = "Generating comprehensive answer"
295
+ step.output = cleaned_response if cleaned_response else response_text
296
+
297
+ end_time = time.time()
298
+ elapsed_time = end_time - start_time
299
+
300
+ return cleaned_response if cleaned_response else response_text, elapsed_time
301
+
302
+ except Exception as e:
303
+ logger.error(f"Error processing query: {str(e)}")
304
+ return f"Sorry, I encountered an error:\n{str(e)}", 0
305
+
306
+ # ====== CHAINLIT SPECIFIC CODE ======
307
+
308
+ @cl.set_starters
309
+ async def set_starters():
310
+ starters = []
311
+ for prompt in MY_CONFIG.STARTER_PROMPTS:
312
+ starters.append(
313
+ cl.Starter(
314
+ label=prompt.strip(),
315
+ message=prompt.strip(),
316
+ )
317
+ )
318
+ return starters
319
+ ## --- end: def set_starters(): ---
320
+
321
+ @cl.on_chat_start
322
+ async def start():
323
+ """Initialize the chat session"""
324
+ # Store initialization state in user session
325
+ cl.user_session.set("chat_started", True)
326
+ logger.info("User chat session started")
327
+ init_error = None
328
+
329
+ try:
330
+ initialize()
331
+ # await cl.Message(content="How can I assist you today?").send()
332
+ except Exception as e:
333
+ init_error = str(e)
334
+ error_msg = f"""System Initialization Error
335
+
336
+ The system failed to initialize with the following error:
337
+
338
+ ```
339
+ {init_error}
340
+ ```
341
+
342
+ Please check your configuration and environment variables."""
343
+ await cl.Message(content=error_msg).send()
344
+
345
+ @cl.on_message
346
+ async def main(message: cl.Message):
347
+ """Handle incoming messages"""
348
+ user_message = message.content
349
+
350
+ # Get response from LLM with RAG steps shown FIRST
351
+ response_text, elapsed_time = await get_llm_response(user_message)
352
+ # logger.info(f"LLM Response:\n{response_text[:200]}...") # Log first 200 chars
353
+
354
+ thinking_content, cleaned_response = extract_thinking_section(response_text)
355
+
356
+ # Add timing stat to response
357
+ full_response = cleaned_response + f"\n\n⏱️ *Total time: {elapsed_time:.1f} seconds*"
358
+
359
+ # THEN create a new message for streaming
360
+ msg = cl.Message(content="")
361
+ await msg.send()
362
+
363
+ # Stream the response in small chunks for better UX
364
+ # This simulates streaming - in a real implementation you'd stream from the LLM
365
+ for i in range(0, len(full_response), 5): # Stream in chunks of 5 characters
366
+ await msg.stream_token(full_response[i:i+5])
367
+ await asyncio.sleep(0.01) # Small delay for visual effect
368
+
369
+ # Update the final message
370
+ msg.content = full_response
371
+ await msg.update()
372
+
373
+ ## -------
374
+ if __name__ == '__main__':
375
+ logger.info("App starting up...")
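For reference, `extract_thinking_section` above separates `<think>...</think>` reasoning from the visible answer. A minimal standalone sketch of the same regex pattern (the sample text is made up):

```python
import re

THINK_PATTERN = r'<think>(.*?)</think>'

def split_thinking(text: str):
    """Return (thinking, answer); thinking is None when no tags are present."""
    match = re.search(THINK_PATTERN, text, re.DOTALL)
    if not match:
        return None, text
    thinking = match.group(1).strip()
    answer = re.sub(THINK_PATTERN, '', text, flags=re.DOTALL).strip()
    return thinking, answer

raw = "<think>The question is about ports; check the config.</think>The app listens on port 8080."
thinking, answer = split_thinking(raw)
assert thinking == "The question is about ports; check the config."
assert answer == "The app listens on port 8080."
```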
app_flask.py ADDED
@@ -0,0 +1,189 @@
1
+ from flask import Flask, g, render_template, request, jsonify
2
+ import os
3
+ import logging
4
+ import time
5
+
6
+ # Import llama-index and related libraries
7
+ from llama_index.core import VectorStoreIndex, StorageContext
8
+ from llama_index.vector_stores.milvus import MilvusVectorStore
9
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
10
+ from llama_index.core import Settings
11
+ from llama_index.llms.litellm import LiteLLM
12
+ from my_config import MY_CONFIG
13
+ import query_utils
14
+
15
+
16
+ os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT
17
+
18
+
19
+
20
+ app = Flask(__name__)
21
+
22
+ # Global variables for LLM and index
23
+ vector_index = None
24
+
25
+ initialization_complete = False
26
+ def initialize():
27
+ """
28
+ Initialize LLM and Milvus vector database using llama-index.
29
+ This function sets up the necessary components for the chat application.
30
+ """
31
+ global vector_index, initialization_complete
32
+
33
+ if initialization_complete:
34
+ return
35
+
36
+ logging.info("Initializing LLM and vector database...")
37
+
38
+ # raise Exception ("init exception test") # debug
39
+
40
+ try:
41
+ ## embedding model
42
+ Settings.embed_model = HuggingFaceEmbedding(
43
+ model_name = MY_CONFIG.EMBEDDING_MODEL
44
+ )
45
+ print("✅ Using embedding model: ", MY_CONFIG.EMBEDDING_MODEL)
46
+
47
+ # Setup LLM using LiteLLM
48
+ llm = LiteLLM(
49
+ model=MY_CONFIG.LLM_MODEL,
50
+ temperature=0.1
51
+ )
52
+ print("✅ LLM run environment: ", MY_CONFIG.LLM_RUN_ENV)
53
+ print("✅ Using LLM model : ", MY_CONFIG.LLM_MODEL)
54
+ Settings.llm = llm
55
+
56
+ # Initialize Milvus vector store for Vector RAG only
57
+ vector_store = MilvusVectorStore(
58
+ uri = MY_CONFIG.MILVUS_URI_VECTOR , # Use dedicated Vector-only database
59
+ dim = MY_CONFIG.EMBEDDING_LENGTH ,
60
+ collection_name = MY_CONFIG.COLLECTION_NAME,
61
+ overwrite=False # so we load the index from db
62
+ )
63
+ storage_context = StorageContext.from_defaults(vector_store=vector_store)
64
+ print ("✅ Connected to Vector-only Milvus instance: ", MY_CONFIG.MILVUS_URI_VECTOR )
65
+
66
+ vector_index = VectorStoreIndex.from_vector_store(
67
+ vector_store=vector_store, storage_context=storage_context)
68
+ print ("✅ Loaded Vector-only index from:", MY_CONFIG.MILVUS_URI_VECTOR )
69
+
70
+ logging.info("Successfully initialized LLM and vector database")
71
+
72
+ initialization_complete = True
73
+ except Exception as e:
74
+ initialization_complete = False
75
+ logging.error(f"Error initializing LLM and vector database: {str(e)}")
76
+ raise e
77
+ # return False
78
+ ## -------------
79
+
80
+ ## ----
81
+ @app.route('/')
82
+ def index():
83
+ init_error = app.config.get('INIT_ERROR', '')
84
+ # init_error = g.get('init_error', None)
85
+ return render_template('index.html', init_error=init_error)
86
+ ## end --- def index():
87
+
88
+
89
+ ## -----
90
+ @app.route('/chat', methods=['POST'])
91
+ def chat():
92
+ user_message = request.json.get('message')
93
+
94
+
95
+ # Get response from LLM
96
+ response = get_llm_response(user_message)
97
+ # print (response)
98
+
99
+ return jsonify({'response': response})
100
+ ## end : def chat():
101
+
102
+
103
+ def get_llm_response(message):
104
+ """
105
+ Process the user message and get a response from the LLM using Vector RAG
106
+ with structured prompting
107
+ """
108
+ global vector_index, initialization_complete
109
+
110
+ # Check if LLM and index are initialized
111
+ if vector_index is None or not initialization_complete:
112
+ return "System did not initialize. Please try again later."
113
+
114
+ start_time = time.time()
115
+ response_text = ''
116
+
117
+ try:
118
+ # raise Exception ("chat exception test") ## debug
119
+ # Create a query engine from the index
120
+ query_engine = vector_index.as_query_engine()
121
+
122
+ # Apply query optimization
123
+ message = query_utils.tweak_query(message, MY_CONFIG.LLM_MODEL)
124
+
125
+ # Get initial vector response
126
+ vector_response = query_engine.query(message)
127
+ vector_text = str(vector_response).strip()
128
+
129
+ # Structured prompt
130
+ structured_prompt = f"""Please provide a comprehensive, well-structured answer using the provided document information.
131
+
132
+ Question: {message}
133
+
134
+ Document Information:
135
+ {vector_text}
136
+
137
+ Instructions:
138
+ 1. Provide accurate, factual information based on the documents
139
+ 2. Structure your response clearly with proper formatting
140
+ 3. Be comprehensive yet concise
141
+ 4. Highlight key relationships and important details when relevant
142
+ 5. Use bullet points or sections when appropriate for clarity
143
+
144
+ Please provide your answer:"""
145
+
146
+ # Use structured prompt for final synthesis
147
+ final_response = query_engine.query(structured_prompt)
148
+
149
+ if final_response:
150
+ response_text = str(final_response).strip()
151
+
152
+ except Exception as e:
153
+ logging.error(f"Error getting LLM response: {str(e)}")
154
+ response_text = f"Sorry, I encountered an error while processing your request:\n{str(e)}"
155
+
156
+ end_time = time.time()
157
+
158
+ # add timing stat
159
+ response_text += f"\n⏱️ *Total time: {(end_time - start_time):.1f} seconds*"
160
+ return response_text
161
+
162
+ ## --- end: def get_llm_response():
163
+
164
+
165
+
166
+
167
+ ## -------
168
+ if __name__ == '__main__':
169
+ # Configure logging
170
+ logging.basicConfig(
171
+ level=logging.INFO,
172
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
173
+ )
174
+ logging.info("App starting up...")
175
+
176
+ # Initialize LLM and vector database
177
+ try:
178
+ initialize()
179
+ except Exception as e:
180
+ logging.warning("Starting without LLM and vector database. Responses will be limited.")
181
+ app.config['INIT_ERROR'] = str(e)
182
+ # g.init_error = str(e)
183
+
184
+
185
+ # Vector RAG Flask App - Configurable port via environment
186
+ PORT = MY_CONFIG.FLASK_VECTOR_PORT
187
+ print(f"🚀 Vector RAG Flask app starting on port {PORT}")
188
+ app.run(host="0.0.0.0", debug=False, port=PORT)
189
+ ## -- end main ----
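The `/chat` route above expects JSON with a `message` field and returns JSON with a `response` field. A minimal client sketch, assuming the Vector RAG app is reachable on its default port 8081 (URL and question are illustrative):

```python
import requests

resp = requests.post(
    "http://localhost:8081/chat",
    json={"message": "What is this website about?"},
    timeout=120,  # the first query can be slow while models warm up
)
resp.raise_for_status()
print(resp.json()["response"])
```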
app_flask_graph.py ADDED
@@ -0,0 +1,264 @@
1
+ """
2
+ GraphRAG Flask Web Application
3
+ """
4
+
5
+ from flask import Flask, render_template, request, jsonify
6
+ import os
7
+ import logging
8
+ import time
9
+ import asyncio
10
+ import nest_asyncio
11
+ from query_graph_functions.setup import create_graphrag_setup
12
+ from query_graph_functions.query_preprocessing import create_query_preprocessor, preprocess_query_pipeline
13
+ from query_graph_functions.knowledge_retrieval import CommunitySearchEngine
14
+ from query_graph_functions.follow_up_search import FollowUpSearch
15
+ from query_graph_functions.vector_augmentation import VectorAugmentationEngine
16
+ from query_graph_functions.answer_synthesis import AnswerSynthesisEngine
17
+ from query_graph_functions.response_management import ResponseManager
18
+ from my_config import MY_CONFIG
19
+ import query_utils
20
+
21
+
22
+ nest_asyncio.apply()
23
+ os.environ['HF_ENDPOINT'] = MY_CONFIG.HF_ENDPOINT
24
+
25
+ app = Flask(__name__)
26
+
27
+ # Global GraphRAG engine
28
+ graph_engine = None
29
+ initialization_complete = False
30
+
31
+ def initialize():
32
+ """
33
+ Initialize GraphRAG system
34
+ """
35
+ global graph_engine, initialization_complete
36
+
37
+ if initialization_complete:
38
+ return
39
+
40
+ logging.info("Initializing GraphRAG system...")
41
+
42
+ try:
43
+ # Initialize setup module (Step 1)
44
+ setup = create_graphrag_setup()
45
+
46
+ # Create GraphRAG engine wrapper
47
+ class GraphQueryEngine:
48
+ def __init__(self, setup):
49
+ self.setup = setup
50
+ self.neo4j_conn = setup.neo4j_conn
51
+ self.query_engine = setup.query_engine
52
+ self.graph_stats = setup.graph_stats
53
+ self.drift_config = setup.drift_config
54
+ self.llm = setup.llm
55
+ self.config = setup.config
56
+ self.query_preprocessor = None
57
+ self.response_manager = ResponseManager(setup)
58
+
59
+ async def run_query_async(self, user_query):
60
+ """Execute GraphRAG query pipeline (Steps 3-20)"""
61
+ start_time = time.time()
62
+
63
+ optimized_query = query_utils.tweak_query(user_query, MY_CONFIG.LLM_MODEL)
64
+
65
+ try:
66
+ if not self.setup.validate_system_readiness():
67
+ return {"answer": "System not ready", "metadata": {}}
68
+
69
+ # Initialize query preprocessor if needed
70
+ if not self.query_preprocessor:
71
+ self.query_preprocessor = await create_query_preprocessor(
72
+ self.config, self.graph_stats
73
+ )
74
+
75
+ # Phase B: Query Preprocessing (Steps 3-5)
76
+ analysis, routing, vectorization = await preprocess_query_pipeline(
77
+ optimized_query, self.config, self.graph_stats
78
+ )
79
+
80
+ # Phase C: Community Retrieval (Steps 6-7)
81
+ community_engine = CommunitySearchEngine(self.setup)
82
+ community_results = await community_engine.execute_primer_phase(
83
+ vectorization.embedding, routing
84
+ )
85
+
86
+ # Phase D: Follow-up Search (Steps 9-12)
87
+ follow_up_engine = FollowUpSearch(self.setup)
88
+ follow_up_results = await follow_up_engine.execute_follow_up_phase(
89
+ community_results, routing
90
+ )
91
+
92
+ # Phase E: Vector Search Augmentation (Steps 13-14)
93
+ vector_engine = VectorAugmentationEngine(self.setup)
94
+ augmentation_results = await vector_engine.execute_vector_augmentation_phase(
95
+ vectorization.embedding,
96
+ {'communities': community_results['communities'],
97
+ 'initial_answer': community_results['initial_answer'],
98
+ 'follow_up_results': follow_up_results},
99
+ routing
100
+ )
101
+
102
+ # Phase F: Answer Synthesis (Steps 15-16)
103
+ synthesis_engine = AnswerSynthesisEngine(self.setup)
104
+ synthesis_results = await synthesis_engine.execute_answer_synthesis_phase(
105
+ analysis, routing, community_results, follow_up_results, augmentation_results
106
+ )
107
+
108
+ total_time = time.time() - start_time
109
+
110
+ # Generate metadata
111
+ metadata = self.response_manager.generate_comprehensive_metadata(
112
+ analysis=analysis,
113
+ routing=routing,
114
+ vectorization=vectorization,
115
+ community_results=community_results,
116
+ follow_up_results=follow_up_results,
117
+ augmentation_results=augmentation_results,
118
+ synthesis_results=synthesis_results,
119
+ total_time=total_time
120
+ )
121
+
122
+ # Cleanup async tasks
123
+ await self.setup.cleanup_async_tasks(timeout=2.0)
124
+
125
+ return {
126
+ "answer": synthesis_results.final_answer,
127
+ "metadata": metadata
128
+ }
129
+
130
+ except Exception as e:
131
+ logging.error(f"Query pipeline error: {e}")
132
+ synthesis_engine = AnswerSynthesisEngine(self.setup)
133
+ return synthesis_engine.generate_error_response(f"Query error: {e}")
134
+
135
+ def run_query(self, user_query):
136
+ """Synchronous wrapper for async query"""
137
+ try:
138
+ loop = asyncio.get_event_loop()
139
+ return loop.run_until_complete(self.run_query_async(user_query))
140
+ except Exception as e:
141
+ logging.error(f"Query execution error: {e}")
142
+ return {"answer": f"Error: {e}", "metadata": {}}
143
+
144
+ graph_engine = GraphQueryEngine(setup)
145
+
146
+ print("✅ GraphRAG system initialized")
147
+ print(f"✅ Using LLM: {MY_CONFIG.LLM_MODEL}")
148
+ print(f"✅ Using embedding: {MY_CONFIG.EMBEDDING_MODEL}")
149
+
150
+ logging.info("GraphRAG system ready")
151
+ initialization_complete = True
152
+
153
+ except Exception as e:
154
+ initialization_complete = False
155
+ logging.error(f"GraphRAG initialization error: {str(e)}")
156
+ raise
157
+
158
+ ## ----
159
+ @app.route('/')
160
+ def index():
161
+ init_error = app.config.get('INIT_ERROR', '')
162
+ # init_error = g.get('init_error', None)
163
+ return render_template('index.html', init_error=init_error)
164
+ ## end --- def index():
165
+
166
+
167
+ ## ----
168
+ @app.route('/health')
169
+ def health():
170
+ """Health check endpoint for deployment platforms"""
171
+ if initialization_complete:
172
+ return jsonify({"status": "healthy", "graphrag": "initialized"}), 200
173
+ else:
174
+ return jsonify({"status": "initializing"}), 503
175
+ ## end --- def health():
176
+
177
+
178
+ ## -----
179
+ @app.route('/chat', methods=['POST'])
180
+ def chat():
181
+ user_message = request.json.get('message')
182
+
183
+
184
+ # Get response from LLM
185
+ response = get_llm_response(user_message)
186
+ # print (response)
187
+
188
+ return jsonify({'response': response})
189
+ ## end : def chat():
190
+
191
+
192
+ def get_llm_response(message):
193
+ """
194
+ Process user message using complete GraphRAG pipeline.
195
+ Implements the full 25-step DRIFT search methodology.
196
+ """
197
+ global graph_engine, initialization_complete
198
+
199
+ if not initialization_complete or graph_engine is None:
200
+ return "System not initialized. Please try again later."
201
+
202
+ start_time = time.time()
203
+
204
+ try:
205
+ # Execute GraphRAG query pipeline
206
+ result = graph_engine.run_query(message)
207
+
208
+ # Extract answer and timing
209
+ full_response = result.get('answer', 'No response generated')
210
+
211
+ # Filter out metadata section more robustly
212
+ lines = full_response.split('\n')
213
+ filtered_lines = []
214
+
215
+ for line in lines:
216
+ stripped = line.strip()
217
+ # Skip these metadata lines completely
218
+ if (stripped.startswith('## Comprehensive Answer') or
219
+ stripped.startswith('# Comprehensive Answer') or
220
+ stripped.startswith('---') or
221
+ stripped.startswith('**Answer Confidence**:') or
222
+ stripped.startswith('**Sources Integrated**:') or
223
+ stripped.startswith('**Multi-Phase Coverage**:')):
224
+ continue
225
+
226
+ filtered_lines.append(line)
227
+
228
+ response_text = '\n'.join(filtered_lines).strip()
229
+ end_time = time.time()
230
+
231
+ # Add timing information
232
+ response_text += f"\n\n⏱️ *Total time: {(end_time - start_time):.1f} seconds*"
233
+
234
+ return response_text
235
+
236
+ except Exception as e:
237
+ logging.error(f"Error processing query: {str(e)}")
238
+ return f"Sorry, I encountered an error:\n{str(e)}"
239
+
240
+
241
+
242
+
243
+ ## -------
244
+ if __name__ == '__main__':
245
+ # Configure logging
246
+ logging.basicConfig(
247
+ level=logging.INFO,
248
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
249
+ )
250
+ logging.info("App starting up...")
251
+
252
+ # Initialize LLM and vector database
253
+ try:
254
+ initialize()
255
+ except Exception as e:
256
+ logging.warning("Starting without LLM and vector database. Responses will be limited.")
257
+ app.config['INIT_ERROR'] = str(e)
258
+ # g.init_error = str(e)
259
+
260
+
261
+ # GraphRAG Flask App - Configurable port via environment
262
+ PORT = MY_CONFIG.FLASK_GRAPH_PORT
263
+ print(f"🚀 GraphRAG Flask app starting on port {PORT}")
264
+ app.run(host="0.0.0.0", debug=False, port=PORT)
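The `/health` route above answers 200 once GraphRAG initialization finishes and 503 while it is still starting, which suits deployment readiness probes. A small polling sketch, assuming the app is exposed on port 8080:

```python
import time
import requests

def wait_until_ready(base_url="http://localhost:8080", timeout_s=600, interval_s=10):
    """Poll /health until the app reports healthy or the timeout expires."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            r = requests.get(f"{base_url}/health", timeout=5)
            if r.status_code == 200:
                return r.json()
        except requests.RequestException:
            pass  # the app may not be listening yet
        time.sleep(interval_s)
    raise TimeoutError("GraphRAG app did not become healthy in time")

print(wait_until_ready())
```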
chainlit.md ADDED
@@ -0,0 +1,14 @@
1
+ # Welcome to Chainlit! 🚀🤖
2
+
3
+ Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
4
+
5
+ ## Useful Links 🔗
6
+
7
+ - **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
8
+ - **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
9
+
10
+ We can't wait to see what you create with Chainlit! Happy coding! 💻😊
11
+
12
+ ## Welcome screen
13
+
14
+ To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
cleanup_pipeline_deps.sh ADDED
@@ -0,0 +1,55 @@
1
+ #!/bin/bash
2
+ # ============================================
3
+ # Cleanup Pipeline Dependencies Script
4
+ # ============================================
5
+ # This script removes heavy packages that are only needed for the pipeline
6
+ # to save ~350-500 MB of RAM in production Docker containers.
7
+ #
8
+ # Run this AFTER the pipeline completes successfully.
9
+ # ============================================
10
+
11
+ echo "============================================"
12
+ echo "Starting Pipeline Dependency Cleanup"
13
+ echo "============================================"
14
+ echo "This will remove packages only needed for:"
15
+ echo " - Document processing (docling, html2text)"
16
+ echo " - Graph community detection (igraph, leidenalg, etc.)"
17
+ echo " - Development tools (ipykernel, tqdm, etc.)"
18
+ echo ""
19
+ echo "Estimated RAM savings: 350-500 MB"
20
+ echo "============================================"
21
+
22
+ # Document processing packages
23
+ echo "Removing document processing packages..."
24
+ pip uninstall -y docling html2text 2>/dev/null || echo " (already removed or not installed)"
25
+
26
+ # Graph community detection packages
27
+ echo "Removing graph community detection packages..."
28
+ pip uninstall -y python-louvain igraph leidenalg graspologic 2>/dev/null || echo " (already removed or not installed)"
29
+
30
+ # Development tools
31
+ echo "Removing development tools..."
32
+ pip uninstall -y tqdm ipykernel fastmcp 2>/dev/null || echo " (already removed or not installed)"
33
+
34
+ # Milvus Lite (if using cloud Zilliz)
35
+ if [ "$VECTOR_DB_TYPE" = "cloud_zilliz" ]; then
36
+ echo "Removing Milvus Lite (using cloud Zilliz)..."
37
+ pip uninstall -y milvus-lite 2>/dev/null || echo " (already removed or not installed)"
38
+ fi
39
+
40
+ # Chainlit (if using Flask only)
41
+ if [ "$APP_TYPE" = "flask_graph" ] || [ "$APP_TYPE" = "flask" ]; then
42
+ echo "Removing Chainlit (using Flask app)..."
43
+ pip uninstall -y chainlit 2>/dev/null || echo " (already removed or not installed)"
44
+ fi
45
+
46
+ echo ""
47
+ echo "============================================"
48
+ echo "Cleanup Complete!"
49
+ echo "============================================"
50
+ echo "Before cleanup: ~800 MB"
51
+ echo "After cleanup: ~300-450 MB (depending on config)"
52
+ echo ""
53
+ echo "Note: If you redeploy and AUTO_RUN_PIPELINE=true,"
54
+ echo " all packages will be reinstalled automatically."
55
+ echo "============================================"
docker-compose.cloud.yml ADDED
@@ -0,0 +1,34 @@
1
+ version: '3.8'
2
+
3
+ services:
4
+ allycat-graphrag-cloud:
5
+ image: allycat-graphrag:cloud
6
+ build:
7
+ context: .
8
+ dockerfile: Dockerfile
9
+ args:
10
+ INSTALL_OLLAMA: "false"
11
+ INSTALL_LOCAL_VECTOR_DB: "false"
12
+ container_name: allycat-cloud
13
+ ports:
14
+ - "${DOCKER_PORT:-8080}:${DOCKER_APP_PORT:-8080}"
15
+ environment:
16
+ - LLM_RUN_ENV=cloud
17
+ - VECTOR_DB_TYPE=cloud_zilliz
18
+ - APP_TYPE=${APP_TYPE:-flask_graph}
19
+ - AUTO_RUN_PIPELINE=${AUTO_RUN_PIPELINE:-false}
20
+ - DOCKER_APP_PORT=${DOCKER_APP_PORT:-8080}
21
+ - FLASK_GRAPH_PORT=${FLASK_GRAPH_PORT:-8080}
22
+ - FLASK_VECTOR_PORT=${FLASK_VECTOR_PORT:-8081}
23
+ - CHAINLIT_GRAPH_PORT=${CHAINLIT_GRAPH_PORT:-8083}
24
+ - CHAINLIT_VECTOR_PORT=${CHAINLIT_VECTOR_PORT:-8082}
25
+ env_file:
26
+ - .env
27
+ command: ["deploy"]
28
+ restart: unless-stopped
29
+ healthcheck:
30
+ test: ["CMD", "curl", "-f", "http://localhost:${DOCKER_APP_PORT:-8080}"]
31
+ interval: 60s
32
+ timeout: 60s
33
+ retries: 1
34
+ start_period: 1500s
docker-compose.hybrid.yml ADDED
@@ -0,0 +1,36 @@
1
+ version: '3.8'
2
+
3
+ services:
4
+ allycat-graphrag-hybrid:
5
+ image: allycat-graphrag:hybrid
6
+ build:
7
+ context: .
8
+ dockerfile: Dockerfile
9
+ args:
10
+ INSTALL_OLLAMA: "false"
11
+ INSTALL_LOCAL_VECTOR_DB: "true"
12
+ container_name: allycat-hybrid
13
+ ports:
14
+ - "${DOCKER_PORT:-8080}:${DOCKER_APP_PORT:-8080}"
15
+ environment:
16
+ - LLM_RUN_ENV=cloud
17
+ - VECTOR_DB_TYPE=local
18
+ - APP_TYPE=${APP_TYPE:-flask_graph}
19
+ - AUTO_RUN_PIPELINE=${AUTO_RUN_PIPELINE:-false}
20
+ - DOCKER_APP_PORT=${DOCKER_APP_PORT:-8080}
21
+ - FLASK_GRAPH_PORT=${FLASK_GRAPH_PORT:-8080}
22
+ - FLASK_VECTOR_PORT=${FLASK_VECTOR_PORT:-8081}
23
+ - CHAINLIT_GRAPH_PORT=${CHAINLIT_GRAPH_PORT:-8083}
24
+ - CHAINLIT_VECTOR_PORT=${CHAINLIT_VECTOR_PORT:-8082}
25
+ env_file:
26
+ - .env
27
+ volumes:
28
+ - ./workspace:/allycat/workspace
29
+ command: ["deploy"]
30
+ restart: unless-stopped
31
+ healthcheck:
32
+ test: ["CMD", "curl", "-f", "http://localhost:${DOCKER_APP_PORT:-8080}"]
33
+ interval: 60s
34
+ timeout: 60s
35
+ retries: 1
36
+ start_period: 1500s
docker-compose.local.yml ADDED
@@ -0,0 +1,38 @@
1
+ version: '3.8'
2
+
3
+ services:
4
+ allycat-graphrag-local:
5
+ image: allycat-graphrag:local
6
+ build:
7
+ context: .
8
+ dockerfile: Dockerfile
9
+ args:
10
+ INSTALL_OLLAMA: "true"
11
+ INSTALL_LOCAL_VECTOR_DB: "true"
12
+ container_name: allycat-local
13
+ ports:
14
+ - "${DOCKER_PORT:-8080}:${DOCKER_APP_PORT:-8080}"
15
+ - "${OLLAMA_PORT:-11434}:11434"
16
+ environment:
17
+ - LLM_RUN_ENV=local_ollama
18
+ - VECTOR_DB_TYPE=local
19
+ - APP_TYPE=${APP_TYPE:-flask_graph}
20
+ - AUTO_RUN_PIPELINE=${AUTO_RUN_PIPELINE:-false}
21
+ - DOCKER_APP_PORT=${DOCKER_APP_PORT:-8080}
22
+ - FLASK_GRAPH_PORT=${FLASK_GRAPH_PORT:-8080}
23
+ - FLASK_VECTOR_PORT=${FLASK_VECTOR_PORT:-8081}
24
+ - CHAINLIT_GRAPH_PORT=${CHAINLIT_GRAPH_PORT:-8083}
25
+ - CHAINLIT_VECTOR_PORT=${CHAINLIT_VECTOR_PORT:-8082}
26
+ - OLLAMA_PORT=${OLLAMA_PORT:-11434}
27
+ env_file:
28
+ - .env
29
+ volumes:
30
+ - ./workspace:/allycat/workspace
31
+ command: ["deploy"]
32
+ restart: unless-stopped
33
+ healthcheck:
34
+ test: ["CMD", "curl", "-f", "http://localhost:${DOCKER_APP_PORT:-8080}"]
35
+ interval: 60s
36
+ timeout: 60s
37
+ retries: 1
38
+ start_period: 1500s
docker-startup.sh ADDED
@@ -0,0 +1,206 @@
1
+ #!/bin/bash
2
+
3
+ echo "=== AllyCAT GraphRAG Docker Startup ==="
4
+
5
+ # Check deployment mode from environment
6
+ LLM_MODE=${LLM_RUN_ENV:-cloud}
7
+ VECTOR_MODE=${VECTOR_DB_TYPE:-cloud_zilliz}
8
+
9
+ echo "LLM Mode: $LLM_MODE"
10
+ echo "Vector DB Mode: $VECTOR_MODE"
11
+
12
+ # Conditional: Start Ollama only if in local mode
13
+ if [ "$LLM_MODE" = "local_ollama" ]; then
14
+ echo "Starting Ollama in local mode..."
15
+
16
+ # Define OLLAMA_MODELS dir
17
+ if [ -z "$OLLAMA_MODELS" ]; then
18
+ export OLLAMA_MODELS=/allycat/workspace/ollama
19
+ fi
20
+
21
+ echo "Env variables for OLLAMA:"
22
+ env | grep OLLAMA
23
+
24
+ # Start ollama
25
+ ollama_model=${OLLAMA_MODEL:-gemma3:1b}
26
+ echo "Starting Ollama server..."
27
+ ollama serve > /allycat/ollama-serve.out 2>&1 &
28
+
29
+ # Wait for ollama to start
30
+ OLLAMA_PORT=${OLLAMA_PORT:-11434}
31
+ while ! nc -z localhost $OLLAMA_PORT; do
32
+ sleep 1
33
+ done
34
+ echo "✅ Ollama started on port $OLLAMA_PORT"
35
+
36
+ # Only download the model if we are in DEPLOY mode
37
+ if [ "$1" == "deploy" ]; then
38
+ echo "Downloading Ollama model: $ollama_model"
39
+ ollama pull $ollama_model
40
+ echo "✅ Ollama model downloaded: $ollama_model"
41
+ fi
42
+ else
43
+ echo "✅ Using cloud LLM mode - Ollama not started"
44
+ fi
45
+
46
+ # Conditional: Setup local vector DB only if needed
47
+ if [ "$VECTOR_MODE" = "local" ]; then
48
+ echo "Setting up local Milvus vector database..."
49
+ mkdir -p /allycat/workspace
50
+ echo "✅ Local vector database directory created"
51
+ else
52
+ echo "✅ Using Zilliz Cloud for vector database"
53
+ fi
54
+
55
+ # Run GraphRAG pipeline if AUTO_RUN_PIPELINE is enabled and in deploy mode
56
+ if [ "$1" == "deploy" ] && [ "${AUTO_RUN_PIPELINE:-false}" = "true" ]; then
57
+ echo ""
58
+ echo "=== Running GraphRAG Pipeline Automatically ==="
59
+ echo ""
60
+
61
+ # Step 1: Crawl website
62
+ if [ -n "$WEBSITE_URL" ]; then
63
+ echo "Step 1/5: Crawling website: $WEBSITE_URL"
64
+ python3 1_crawl_site.py || echo "⚠️ Warning: Crawl failed, continuing..."
65
+ echo "✅ Step 1 complete"
66
+ echo ""
67
+ else
68
+ echo "⚠️ Skipping crawl - WEBSITE_URL not set"
69
+ fi
70
+
71
+ # Step 2: Process files to markdown
72
+ echo "Step 2/5: Processing files to markdown..."
73
+ python3 2_process_files.py || echo "⚠️ Warning: Processing failed, continuing..."
74
+ echo "✅ Step 2 complete"
75
+ echo ""
76
+
77
+ # Step 3: Save to vector database
78
+ echo "Step 3/5: Saving to vector database..."
79
+ if [ "$VECTOR_MODE" = "cloud_zilliz" ]; then
80
+ python3 3_save_to_vector_db_zilliz.py || echo "⚠️ Warning: Vector DB save failed, continuing..."
81
+ else
82
+ python3 3_save_to_vector_db.py || echo "⚠️ Warning: Vector DB save failed, continuing..."
83
+ fi
84
+ echo "✅ Step 3 complete"
85
+ echo ""
86
+
87
+ # Step 4: Process graph data (3 phases)
88
+ echo "Step 4/5: Processing graph data (3 phases)..."
89
+ echo " Phase 1: Extracting entities and relationships..."
90
+ python3 2b_process_graph_phase1.py || echo "⚠️ Warning: Phase 1 failed, continuing..."
91
+ echo " Phase 2: Building communities..."
92
+ python3 2b_process_graph_phase2.py || echo "⚠️ Warning: Phase 2 failed, continuing..."
93
+ echo " Phase 3: Generating community summaries..."
94
+ python3 2b_process_graph_phase3.py || echo "⚠️ Warning: Phase 3 failed, continuing..."
95
+ echo "✅ Step 4 complete"
96
+ echo ""
97
+
98
+ # Step 5: Save to graph database
99
+ echo "Step 5/5: Saving to graph database..."
100
+ python3 3b_save_to_graph_db.py || echo "⚠️ Warning: Graph DB save failed, continuing..."
101
+ echo "✅ Step 5 complete"
102
+ echo ""
103
+
104
+ echo "=== ✅ Pipeline Complete - Starting Application ==="
105
+ echo ""
106
+
107
+ # OPTIMIZATION: Clean up pipeline dependencies to save RAM
108
+ if [ "${CLEANUP_PIPELINE_DEPS:-false}" = "true" ]; then
109
+ echo ""
110
+ echo "=== 🧹 Cleaning Up Pipeline Dependencies ==="
111
+ echo "This will save ~350-500 MB of RAM"
112
+ echo ""
113
+ chmod +x ./cleanup_pipeline_deps.sh
114
+ ./cleanup_pipeline_deps.sh
115
+ echo ""
116
+ echo "=== ✅ Cleanup Complete ==="
117
+ echo ""
118
+ else
119
+ echo ""
120
+ echo "💡 TIP: Set CLEANUP_PIPELINE_DEPS=true in .env to save ~350-500 MB RAM"
121
+ echo " after pipeline completes (reduces OOM errors on 1GB containers)"
122
+ echo ""
123
+ fi
124
+ fi
125
+
126
+ # Start the appropriate web application
127
+ APP_TYPE=${APP_TYPE:-flask_graph}
128
+ DOCKER_APP_PORT=${DOCKER_APP_PORT:-8080}
129
+ FLASK_GRAPH_PORT=${FLASK_GRAPH_PORT:-8080}
130
+ FLASK_VECTOR_PORT=${FLASK_VECTOR_PORT:-8081}
131
+ CHAINLIT_GRAPH_PORT=${CHAINLIT_GRAPH_PORT:-8083}
132
+ CHAINLIT_VECTOR_PORT=${CHAINLIT_VECTOR_PORT:-8082}
133
+
134
+ # Log port configuration
135
+ echo ""
136
+ echo "=== Port Configuration ==="
137
+ echo "DOCKER_APP_PORT (internal container): $DOCKER_APP_PORT"
138
+ echo "FLASK_GRAPH_PORT: $FLASK_GRAPH_PORT"
139
+ echo "FLASK_VECTOR_PORT: $FLASK_VECTOR_PORT"
140
+ echo "CHAINLIT_GRAPH_PORT: $CHAINLIT_GRAPH_PORT"
141
+ echo "CHAINLIT_VECTOR_PORT: $CHAINLIT_VECTOR_PORT"
142
+ echo ""
143
+
144
+ # Determine which port will be used based on APP_TYPE
145
+ case $APP_TYPE in
146
+ "flask_graph")
147
+ APP_PORT=$FLASK_GRAPH_PORT
148
+ ;;
149
+ "chainlit_graph")
150
+ APP_PORT=$CHAINLIT_GRAPH_PORT
151
+ ;;
152
+ "flask")
153
+ APP_PORT=$FLASK_VECTOR_PORT
154
+ ;;
155
+ "chainlit")
156
+ APP_PORT=$CHAINLIT_VECTOR_PORT
157
+ ;;
158
+ *)
159
+ APP_PORT=$FLASK_GRAPH_PORT
160
+ ;;
161
+ esac
162
+
163
+ echo "Selected APP_TYPE: $APP_TYPE will run on port: $APP_PORT"
164
+ echo "Container will expose application on port: $DOCKER_APP_PORT (mapped to host DOCKER_PORT)"
165
+ echo ""
166
+
167
+ if [ "$1" == "deploy" ]; then
168
+ echo "In deploy mode..."
169
+
170
+ case $APP_TYPE in
171
+ "flask_graph")
172
+ echo "Starting Flask GraphRAG app on port $FLASK_GRAPH_PORT..."
173
+ python3 app_flask_graph.py
174
+ ;;
175
+ "chainlit_graph")
176
+ echo "Starting Chainlit GraphRAG app on port $CHAINLIT_GRAPH_PORT..."
177
+ chainlit run app_chainlit_graph.py --host 0.0.0.0 --port $CHAINLIT_GRAPH_PORT
178
+ ;;
179
+ "flask")
180
+ echo "Starting Flask Vector RAG app on port $FLASK_VECTOR_PORT..."
181
+ python3 app_flask.py
182
+ ;;
183
+ "chainlit")
184
+ echo "Starting Chainlit Vector RAG app on port $CHAINLIT_VECTOR_PORT..."
185
+ chainlit run app_chainlit.py --host 0.0.0.0 --port $CHAINLIT_VECTOR_PORT
186
+ ;;
187
+ *)
188
+ echo "Starting default Flask GraphRAG app on port $FLASK_GRAPH_PORT..."
189
+ python3 app_flask_graph.py
190
+ ;;
191
+ esac
192
+ else
193
+ echo "Not in deploy mode, entering interactive shell."
194
+ echo ""
195
+ echo "Available commands:"
196
+ echo " python3 app_flask_graph.py - Start Flask GraphRAG app"
197
+ echo " python3 app_flask.py - Start Flask VectorRAG app"
198
+ echo " chainlit run app_chainlit_graph.py - Start Chainlit GraphRAG app"
199
+ echo " chainlit run app_chainlit.py - Start Chainlit VectorRAG app"
200
+
201
+ if [ "$LLM_MODE" = "local_ollama" ]; then
202
+ echo " ollama pull $ollama_model - Download Ollama model"
203
+ fi
204
+ echo ""
205
+ /bin/bash
206
+ fi
env.sample.txt ADDED
@@ -0,0 +1,177 @@
1
+ # ============================================
2
+ # AllyCAT GraphRAG Configuration
3
+ # ============================================
4
+ # This file contains all configuration options for AllyCAT GraphRAG.
5
+ # Copy this file to .env and customize the values.
6
+
7
+ # ============================================
8
+ # Deployment Mode
9
+ # ============================================
10
+ # Automatically run the complete pipeline on startup (Docker deployments)
11
+ # Set to true for Heroku, AWS, Google Cloud Run, etc.
12
+ AUTO_RUN_PIPELINE=false
13
+
14
+ # Memory Optimization: Remove pipeline dependencies after completion
15
+ # Saves ~350-500 MB RAM - recommended for 1GB containers (DigitalOcean, etc.)
16
+ # Set to true to automatically clean up heavy packages after pipeline completes
17
+ CLEANUP_PIPELINE_DEPS=false
18
+
19
+ # ============================================
20
+ # Website Crawling Configuration
21
+ # ============================================
22
+ # Website to crawl (required if AUTO_RUN_PIPELINE=true)
23
+ WEBSITE_URL=https://example.com
24
+ CRAWL_MAX_DOWNLOADS=100
25
+ CRAWL_MAX_DEPTH=3
26
+ WAITTIME_BETWEEN_REQUESTS=0.1
27
+
28
+ # ============================================
29
+ # LLM Configuration (Cloud-First)
30
+ # ============================================
31
+ # LLM Runtime Environment
32
+ # Options: cloud, local_ollama
33
+ LLM_RUN_ENV=cloud
34
+
35
+ # LLM Model Selection
36
+ # Cloud providers: cerebras/llama3.1-8b, gemini/gemini-1.5-flash, nebius/meta-llama/Meta-Llama-3.1-8B-Instruct
37
+ # Local: ollama/gemma3:1b
38
+ LLM_MODEL=cerebras/llama3.1-8b
39
+
40
+ # ============================================
41
+ # LLM API Keys (Cloud Providers)
42
+ # ============================================
43
+ # Get your FREE API keys:
44
+ # - Cerebras: https://cerebras.ai/ (recommended)
45
+ # - Gemini: https://aistudio.google.com/
46
+ # - Nebius: https://studio.nebius.ai/
47
+
48
+ CEREBRAS_API_KEY=your_cerebras_api_key
49
+ GEMINI_API_KEY=your_gemini_api_key
50
+ NEBIUS_API_KEY=your_nebius_api_key
51
+
52
+ # ============================================
53
+ # Local Ollama Configuration (Optional)
54
+ # ============================================
55
+ # Only needed if LLM_RUN_ENV=local_ollama
56
+ # OLLAMA_MODEL=gemma3:1b
57
+ # OLLAMA_BASE_URL=http://localhost:11434
58
+
59
+ # ============================================
60
+ # Vector Database Configuration
61
+ # ============================================
62
+ # Options: cloud_zilliz (recommended), local
63
+ VECTOR_DB_TYPE=cloud_zilliz
64
+
65
+ # Zilliz Cloud Configuration (https://cloud.zilliz.com/)
66
+ ZILLIZ_CLUSTER_ENDPOINT=https://your-cluster.zilliz.cloud
67
+ ZILLIZ_TOKEN=your_zilliz_token
68
+
69
+ # Local Milvus Configuration (only if VECTOR_DB_TYPE=local)
70
+ # MILVUS_URI=./workspace/milvus_lite.db
71
+
72
+ # ============================================
73
+ # Graph Database Configuration (Neo4j)
74
+ # ============================================
75
+ # Neo4j Aura (Cloud) - Recommended: https://neo4j.com/cloud/aura/
76
+ NEO4J_URI=neo4j+s://your-instance.databases.neo4j.io
77
+ NEO4J_USERNAME=neo4j
78
+ NEO4J_PASSWORD=your_neo4j_password
79
+ NEO4J_DATABASE=neo4j
80
+
81
+ # Local Neo4j (only for development)
82
+ # NEO4J_URI=bolt://localhost:7687
83
+
84
+ # ============================================
85
+ # Graph Extraction LLM Provider
86
+ # ============================================
87
+ # Provider for entity/relationship extraction
88
+ # Options: gemini (recommended, 1500 free requests/day), cerebras
89
+ GRAPH_LLM_PROVIDER=gemini
90
+
91
+ # API keys are shared from LLM Configuration section above
92
+
93
+ # ============================================
94
+ # Embedding Model Configuration
95
+ # ============================================
96
+ # Embedding model for semantic search
97
+ # Options:
98
+ # - ibm-granite/granite-embedding-30m-english (61 MB, fastest)
99
+ # - BAAI/bge-small-en-v1.5 (129 MB, balanced)
100
+ # - ibm-granite/granite-embedding-107m-multilingual (219 MB, multilingual)
101
+ EMBEDDING_MODEL=ibm-granite/granite-embedding-30m-english
102
+ EMBEDDING_LENGTH=384
103
+
104
+ # ============================================
105
+ # Chunking Configuration
106
+ # ============================================
107
+ CHUNK_SIZE=512
108
+ CHUNK_OVERLAP=20
109
+
110
+ # ============================================
111
+ # Graph Extraction Configuration
112
+ # ============================================
113
+ # Entity and relationship extraction parameters
114
+ GRAPH_MIN_ENTITIES=5
115
+ GRAPH_MAX_ENTITIES=15
116
+ GRAPH_MIN_RELATIONSHIPS=3
117
+ GRAPH_MAX_RELATIONSHIPS=8
118
+ GRAPH_MIN_CONFIDENCE=0.8
119
+ GRAPH_MAX_CONTENT_CHARS=12000
120
+ GRAPH_SENTENCE_BOUNDARY_RATIO=0.7
121
+
122
+ # ============================================
123
+ # Graph Community Detection (Phase 2)
124
+ # ============================================
125
+ # Leiden algorithm parameters for community detection
126
+ GRAPH_MIN_COMMUNITY_SIZE=5
127
+ GRAPH_LEIDEN_RESOLUTION=1.0
128
+ GRAPH_LEIDEN_ITERATIONS=-1
129
+ GRAPH_LEIDEN_SEED=42
130
+ GRAPH_TARGET_COVERAGE_MIN=5.0
131
+ GRAPH_TARGET_COVERAGE_MAX=8.0
132
+ GRAPH_RESOLUTION_CANDIDATES=0.1,0.5,1.0,2.0,5.0,10.0,20.0,30.0,50.0,100.0
133
+ GRAPH_MIN_NODES_FOR_OPTIMIZATION=50
134
+
135
+ # ============================================
136
+ # Application Configuration
137
+ # ============================================
138
+ # Application type for Docker deployment
139
+ # Options: flask_graph (default), chainlit_graph, flask
140
+ APP_TYPE=flask_graph
141
+
142
+ # Flask server port
143
+ PORT=8080
144
+
145
+ # UI starter prompts (pipe-separated)
146
+ UI_STARTER_PROMPTS=What is this website? | What are upcoming events? | Who are some of the partners?
147
+
148
+ # ============================================
149
+ # Port Configuration
150
+ # ============================================
151
+ # Flask apps (Vector RAG vs GraphRAG) - Auto-configured via MY_CONFIG
152
+ FLASK_VECTOR_PORT=8081 # app_flask.py (vector-only RAG)
153
+ FLASK_GRAPH_PORT=8080 # app_flask_graph.py (GraphRAG)
154
+
155
+ # Chainlit apps (interactive UI) - Default port: 8000, custom ports for Docker
156
+ CHAINLIT_VECTOR_PORT=8082 # app_chainlit.py (Docker only; native Python uses 8000)
157
+ CHAINLIT_GRAPH_PORT=8083 # app_chainlit_graph.py (Docker only; native Python uses 8000)
158
+
159
+ # Docker and external services
160
+ DOCKER_PORT=8080 # External Docker exposed port (host side)
161
+ DOCKER_APP_PORT=8080 # Internal container port (container side, matches APP_TYPE)
162
+ OLLAMA_PORT=11434 # Ollama server port (for local LLM)
163
+
164
+ # ============================================
165
+ # Workspace Configuration
166
+ # ============================================
167
+ # For native execution: use relative path 'workspace'
168
+ # For Docker: use absolute path '/allycat/workspace'
169
+ WORKSPACE_DIR=workspace
170
+
171
+ # ============================================
172
+ # Advanced Configuration
173
+ # ============================================
174
+ # Hugging Face endpoint (for Chinese users or custom mirrors)
175
+ HF_ENDPOINT=https://huggingface.co
176
+
177
+
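`UI_STARTER_PROMPTS` above is a single pipe-separated string. A small sketch of how such a value can be split into individual prompts; the parsing shown here is an assumption for illustration, the actual handling lives in my_config.py:

```python
import os

os.environ.setdefault(
    "UI_STARTER_PROMPTS",
    "What is this website? | What are upcoming events? | Who are some of the partners?",
)
# Split on '|' and drop empty entries and surrounding whitespace
starters = [p.strip() for p in os.environ["UI_STARTER_PROMPTS"].split("|") if p.strip()]
print(starters)
# ['What is this website?', 'What are upcoming events?', 'Who are some of the partners?']
```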
file_utils.py ADDED
@@ -0,0 +1,56 @@
1
+ import os
2
+ import requests
3
+ from humanfriendly import format_size
4
+ import pandas as pd
5
+ import glob
6
+ from urllib.parse import unquote
7
+
8
+
9
+ ## Reads parquet files in a folder into a pandas dataframe
10
+ def read_parquet_files_as_df (parquet_dir):
11
+ parquet_files = glob.glob(f'{parquet_dir}/*.parquet')
12
+
13
+ # read each parquet file into a DataFrame and store in a list
14
+ dfs = [pd.read_parquet (f) for f in parquet_files]
15
+
16
+ # Concatenate all DataFrames into a single DataFrame
17
+ data_df = pd.concat(dfs, ignore_index=True)
18
+ return data_df
19
+
20
+
21
+ def download_file(url, local_file, chunk_size=1024*1024):
22
+ """
23
+ Downloads a remote URL to a local file.
24
+
25
+ Args:
26
+ url (str): The remote URL.
27
+ local_file (str): The path of the local file to save the downloaded content.
28
+ chunk_size (int): The size in bytes of each chunk. Defaults to 1 MB (1024*1024).
29
+
30
+ Returns:
31
+ None
32
+
33
+ Example usage:
34
+ download_file('http://example.com/file.txt', 'file.txt', chunk_size=1024*1024) # Download in chunks of 1MB
35
+ """
36
+ # Check if the local file already exists
37
+ if os.path.exists(local_file):
38
+ file_size = format_size(os.path.getsize(local_file))
39
+ print(f"Local file '{local_file}' ({file_size}) already exists. Skipping download.")
40
+ return
41
+
42
+ # Create the directory if it doesn't exist
43
+ os.makedirs(os.path.dirname(local_file), exist_ok=True)
44
+
45
+ # Stream the file download
46
+ with requests.get(url, stream=True) as r:
47
+ r.raise_for_status()
48
+ with open(local_file, 'wb') as f:
49
+ for chunk in r.iter_content(chunk_size=chunk_size):
50
+ if chunk: # filter out keep-alive new chunks
51
+ f.write(chunk)
52
+ print()
53
+ file_size = format_size(os.path.getsize(local_file))
54
+ print(f"{local_file} ({file_size}) downloaded successfully.")
55
+ ## --- end: download_file ------
56
+
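A short usage sketch for the helpers above; the URL and paths are illustrative only:

```python
from file_utils import download_file, read_parquet_files_as_df

# Download one parquet file (skipped if it already exists), then load the folder
download_file("https://example.com/data/part-0.parquet", "workspace/parquet/part-0.parquet")
df = read_parquet_files_as_df("workspace/parquet")
print(df.shape)
```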
litellm_patch.py ADDED
@@ -0,0 +1,106 @@
1
+ """
2
+ LiteLLM Async Task Cleanup Patch
3
+
4
+ This module patches LiteLLM's asynchronous logging to ensure all tasks complete properly
5
+ and prevent "Task was destroyed but it is pending!" errors.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ import functools
11
+ import inspect
12
+ import sys
13
+ from typing import Any, Callable, Coroutine
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Global registry of pending LiteLLM async tasks
18
+ _pending_tasks = set()
19
+
20
+ def _patch_litellm_async_logging():
21
+ """
22
+ Patches LiteLLM async logging functions to ensure proper task cleanup.
23
+ Prevents "Task was destroyed but it is pending!" errors.
24
+ """
25
+ try:
26
+ # Try to import LiteLLM modules
27
+ import litellm
28
+ from litellm.utils import _client_async_logging_helper
29
+
30
+ # Store original function
31
+ original_client_async_logging = _client_async_logging_helper
32
+
33
+ # Create patched version with error handling
34
+ @functools.wraps(original_client_async_logging)
35
+ async def patched_client_async_logging_helper(*args, **kwargs):
36
+ try:
37
+ return await original_client_async_logging(*args, **kwargs)
38
+ except Exception as e:
39
+ logger.warning(f"LiteLLM async logging error (handled): {e}")
40
+ return None
41
+
42
+ # Apply patch
43
+ litellm.utils._client_async_logging_helper = patched_client_async_logging_helper
44
+
45
+ # Patch Logging class async_success_handler if available
46
+ if hasattr(litellm, 'litellm_core_utils') and hasattr(litellm.litellm_core_utils, 'litellm_logging'):
47
+ from litellm.litellm_core_utils.litellm_logging import Logging
48
+
49
+ if hasattr(Logging, 'async_success_handler'):
50
+ original_async_success_handler = Logging.async_success_handler
51
+
52
+ @functools.wraps(original_async_success_handler)
53
+ async def patched_async_success_handler(*args, **kwargs):
54
+ try:
55
+ return await original_async_success_handler(*args, **kwargs)
56
+ except Exception as e:
57
+ logger.warning(f"LiteLLM async_success_handler error (handled): {e}")
58
+ return None
59
+
60
+ Logging.async_success_handler = patched_async_success_handler
61
+
62
+ logger.info("Successfully patched LiteLLM async logging functions")
63
+ return True
64
+
65
+ except ImportError:
66
+ logger.warning("Could not find LiteLLM modules to patch")
67
+ return False
68
+ except Exception as e:
69
+ logger.error(f"Error patching LiteLLM: {e}")
70
+ return False
71
+
72
+
73
+ def create_task_with_cleanup(coro: Coroutine) -> asyncio.Task:
74
+ """
75
+ Creates an asyncio task with automatic cleanup registration.
76
+ Prevents orphaned tasks and associated warnings.
77
+ """
78
+ task = asyncio.create_task(coro)
79
+ _pending_tasks.add(task)
80
+ task.add_done_callback(_pending_tasks.discard)
81
+ return task
82
+
83
+
84
+ async def cleanup_all_async_tasks(timeout: float = 2.0):
85
+ """
86
+ Waits for pending async tasks to complete within timeout period.
87
+ Should be called before exiting async contexts to prevent warnings.
88
+ """
89
+ if not _pending_tasks:
90
+ return
91
+
92
+ logger.debug(f"Cleaning up {len(_pending_tasks)} pending async tasks...")
93
+ try:
94
+ # Wait for all pending tasks with a timeout
95
+ done, pending = await asyncio.wait(
96
+ _pending_tasks, timeout=timeout, return_when=asyncio.ALL_COMPLETED
97
+ )
98
+
99
+ if pending:
100
+ logger.warning(f"{len(pending)} async tasks still pending after timeout")
101
+ except Exception as e:
102
+ logger.error(f"Error during async task cleanup: {e}")
103
+
104
+
105
+ # Apply the patch when this module is imported
106
+ _patch_litellm_async_logging()
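A minimal sketch of how an application might use this module: importing it applies the patch as a side effect, and the explicit helper calls below are only needed if you schedule your own background tasks. The coroutine shown is a placeholder:

```python
import asyncio
from litellm_patch import create_task_with_cleanup, cleanup_all_async_tasks

async def main():
    # Schedule a background coroutine through the tracked helper instead of
    # asyncio.create_task() directly, so it is registered for cleanup.
    create_task_with_cleanup(asyncio.sleep(0.1))  # placeholder coroutine

    # ... run LLM calls / queries here ...

    # Give any still-pending async tasks a chance to finish
    # before the event loop shuts down.
    await cleanup_all_async_tasks(timeout=2.0)

asyncio.run(main())
```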
my_config.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
+ # Load environment variables from .env file
5
+ load_dotenv()
6
+
7
+ ## Configuration
8
+
9
+ class MyConfig:
10
+ pass
11
+
12
+ MY_CONFIG = MyConfig()
13
+
14
+ ## All of these settings can be overridden by .env file
15
+ ## And it will be loaded automatically by load_dotenv()
16
+ ## And they will take precedence over the default values below
17
+ ## See sample .env file 'env.sample.txt' for reference
18
+
19
+ ## HuggingFace config
20
+ MY_CONFIG.HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://thealliance.ai/")
21
+
22
+ ## Crawl settings
23
+ MY_CONFIG.WEBSITE_URL = os.getenv("WEBSITE_URL", "")
24
+ MY_CONFIG.CRAWL_MAX_DOWNLOADS = int(os.getenv("CRAWL_MAX_DOWNLOADS", 100))
25
+ MY_CONFIG.CRAWL_MAX_DEPTH = int(os.getenv("CRAWL_MAX_DEPTH", 3))
26
+ MY_CONFIG.WAITTIME_BETWEEN_REQUESTS = float(os.getenv("WAITTIME_BETWEEN_REQUESTS", 0.1)) # in seconds
27
+ MY_CONFIG.CRAWL_MIME_TYPE = 'text/html'
28
+
29
+
30
+ ## Directories
31
+ MY_CONFIG.WORKSPACE_DIR = os.path.join(os.getenv('WORKSPACE_DIR', 'workspace'))
32
+ MY_CONFIG.CRAWL_DIR = os.path.join( MY_CONFIG.WORKSPACE_DIR, "crawled")
33
+ MY_CONFIG.PROCESSED_DATA_DIR = os.path.join( MY_CONFIG.WORKSPACE_DIR, "processed")
34
+
35
+ ## llama index will download the models to this directory
36
+ os.environ["LLAMA_INDEX_CACHE_DIR"] = os.path.join(MY_CONFIG.WORKSPACE_DIR, "llama_index_cache")
37
+ ### -------------------------------
38
+
39
+ # Find embedding models: https://huggingface.co/spaces/mteb/leaderboard
40
+
41
+ MY_CONFIG.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", 'ibm-granite/granite-embedding-30m-english')
42
+ MY_CONFIG.EMBEDDING_LENGTH = int(os.getenv("EMBEDDING_LENGTH", 384))
43
+
44
+ ## Chunking
45
+ MY_CONFIG.CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", 512))
46
+ MY_CONFIG.CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", 20))
47
+
48
+
49
+ ### Milvus config
50
+ MY_CONFIG.COLLECTION_NAME = 'pages'
51
+
52
+ # Separate Milvus databases for different RAG approaches
53
+ # This allows running Vector RAG and Hybrid GraphRAG simultaneously without conflicts
54
+ MY_CONFIG.MILVUS_URI_VECTOR = os.path.join( MY_CONFIG.WORKSPACE_DIR, 'vector_only_milvus.db') # Vector RAG only
55
+ MY_CONFIG.MILVUS_URI_HYBRID_GRAPH = os.path.join( MY_CONFIG.WORKSPACE_DIR, 'hybrid_graph_milvus.db') # Hybrid GraphRAG
56
+
57
+ # Vector Database Configuration
58
+ MY_CONFIG.VECTOR_DB_TYPE = os.getenv("VECTOR_DB_TYPE", "cloud_zilliz") # Options: "local" or "cloud_zilliz"
59
+
60
+ # Zilliz Cloud Configuration (for cloud deployment)
61
+ MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT = os.getenv("ZILLIZ_CLUSTER_ENDPOINT")
62
+ MY_CONFIG.ZILLIZ_TOKEN = os.getenv("ZILLIZ_TOKEN")
63
+
64
+
65
+
66
+ ## ---- LLM settings ----
67
+ ## Choose one: We can do local or cloud LLMs
68
+ # LLM_RUN_ENV controls which LLM backend to use: 'local_ollama' for local Ollama, 'cloud' for cloud LLMs
69
+ # Set LLM_RUN_ENV in your .env file. Default is 'cloud' for production deployment.
70
+ ## Local LLMs are run on your machine using Ollama
71
+ ## Cloud LLMs are run on any LiteLLM supported service like Replicate / Nebius / Cerebras / etc
72
+ ## For running Ollama locally, please check the instructions in the docs/llm-local.md file
73
+
74
+
75
+ MY_CONFIG.LLM_RUN_ENV = os.getenv("LLM_RUN_ENV", "cloud")
76
+
77
+
78
+ MY_CONFIG.LLM_MODEL = os.getenv("LLM_MODEL", 'cerebras/llama3.1-8b')
79
+
80
+ # Replicate API token (if using Replicate)
81
+ MY_CONFIG.REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN", None)
82
+ # Nebius API key (if using Nebius)
83
+ MY_CONFIG.NEBIUS_API_KEY = os.getenv("NEBIUS_API_KEY", None)
84
+
85
+ # --- GraphBuilder LLM API keys ---
86
+ MY_CONFIG.CEREBRAS_API_KEY = os.getenv("CEREBRAS_API_KEY", None)
87
+ MY_CONFIG.GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", None)
88
+
89
+ # --- Graph entity/relationship extraction config ---
90
+ MY_CONFIG.GRAPH_MIN_ENTITIES = int(os.getenv("GRAPH_MIN_ENTITIES", 5))
91
+ MY_CONFIG.GRAPH_MAX_ENTITIES = int(os.getenv("GRAPH_MAX_ENTITIES", 15))
92
+ MY_CONFIG.GRAPH_MIN_RELATIONSHIPS = int(os.getenv("GRAPH_MIN_RELATIONSHIPS", 3))
93
+ MY_CONFIG.GRAPH_MAX_RELATIONSHIPS = int(os.getenv("GRAPH_MAX_RELATIONSHIPS", 8))
94
+ MY_CONFIG.GRAPH_MIN_CONFIDENCE = float(os.getenv("GRAPH_MIN_CONFIDENCE", 0.8))
95
+ MY_CONFIG.GRAPH_MAX_CONTENT_CHARS = int(os.getenv("GRAPH_MAX_CONTENT_CHARS", 12000))
96
+ MY_CONFIG.GRAPH_SENTENCE_BOUNDARY_RATIO = float(os.getenv("GRAPH_SENTENCE_BOUNDARY_RATIO", 0.7))
97
+
98
+
99
+
100
+ ## --- GraphRAG ---
101
+ # --- Neo4j config ---
102
+ MY_CONFIG.NEO4J_URI = os.getenv("NEO4J_URI")
103
+ MY_CONFIG.NEO4J_USER = os.getenv("NEO4J_USERNAME")
104
+ MY_CONFIG.NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
105
+ MY_CONFIG.NEO4J_DATABASE = os.getenv("NEO4J_DATABASE")
106
+ MY_CONFIG.GRAPH_DATA_DIR = os.path.join(MY_CONFIG.WORKSPACE_DIR, "graph_data")
107
+
108
+
109
+
110
+
111
+
112
+ ## --- UI settings ---
113
+ MY_CONFIG.STARTER_PROMPTS_STR = os.getenv("UI_STARTER_PROMPTS", 'What is this website? | What are upcoming events? | Who are some of the partners?')
114
+
115
+
116
+ MY_CONFIG.STARTER_PROMPTS = MY_CONFIG.STARTER_PROMPTS_STR.split("|") if MY_CONFIG.STARTER_PROMPTS_STR else []
117
+
118
+
119
+ ## --- Port Configuration ---
120
+ # Flask apps (auto-configured via MY_CONFIG)
121
+ MY_CONFIG.FLASK_VECTOR_PORT = int(os.getenv("FLASK_VECTOR_PORT", 8081)) # app_flask.py (vector RAG)
122
+ MY_CONFIG.FLASK_GRAPH_PORT = int(os.getenv("FLASK_GRAPH_PORT", 8080)) # app_flask_graph.py (GraphRAG)
123
+
124
+ # Chainlit apps (default port: 8000, custom ports for Docker deployments)
125
+ MY_CONFIG.CHAINLIT_VECTOR_PORT = int(os.getenv("CHAINLIT_VECTOR_PORT", 8082)) # app_chainlit.py (Docker: 8082, Native: 8000)
126
+ MY_CONFIG.CHAINLIT_GRAPH_PORT = int(os.getenv("CHAINLIT_GRAPH_PORT", 8083)) # app_chainlit_graph.py (Docker: 8083, Native: 8000)
127
+
128
+ # Docker and external services
129
+ MY_CONFIG.DOCKER_PORT = int(os.getenv("DOCKER_PORT", 8080)) # External host port (maps to DOCKER_APP_PORT)
130
+ MY_CONFIG.DOCKER_APP_PORT = int(os.getenv("DOCKER_APP_PORT", 8080)) # Internal container port (all apps use this in Docker)
131
+ MY_CONFIG.OLLAMA_PORT = int(os.getenv("OLLAMA_PORT", 11434)) # Ollama server port
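As a quick illustration of how this config is consumed (a sketch, not code from the repository): any value can be overridden via `.env`, and downstream scripts simply import the `MY_CONFIG` singleton. The `.env` values shown are examples only:

```python
# Example .env overrides (hypothetical):
#   CHUNK_SIZE=1024
#   LLM_MODEL=gemini/gemini-1.5-flash

from my_config import MY_CONFIG

print(MY_CONFIG.CHUNK_SIZE)         # 1024 if overridden in .env, else the default 512
print(MY_CONFIG.LLM_MODEL)          # whichever model LLM_MODEL resolves to
print(MY_CONFIG.MILVUS_URI_VECTOR)  # derived path under WORKSPACE_DIR
```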
news.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Allycat News
2
+
3
+ ## 2025-10-17: GraphRAG
4
+
5
+ Niloy Deb Barma from [Vel Tech University](https://www.veltech.edu.in/btech-admissions/) in Chennai is in the process of adding GraphRAG to AllyCat.
6
+
7
+ ## 2025-08-23: Expanded Team
8
+
9
+ In addition to Sujee and Dave, we have begun working with Chirag, Nikhil and the team at [Open Governance](https://www.opgov.ai).
10
+
11
+ ## 2025-07-14: Big Update
12
+
13
+ Lots of cool updates:
14
+
15
+ **Robust web crawler** ([#31](https://github.com/The-AI-Alliance/allycat/issues/31))
16
+
17
+ Completely redid the web crawler. Now it
18
+ - is more robust and handles scenarios that made the previous crawler fail.
19
+ - can handle multiple file types (not just text/html) correctly
20
+ - handles anchor tags (`a.html#news`) in HTML files correctly
21
+ - pauses (customizable) between requests so as not to hammer the webserver.
22
+
23
+ **using [LiteLLM](https://docs.litellm.ai/docs/) for LLM inference** ([#34](https://github.com/The-AI-Alliance/allycat/issues/34))
24
+
25
+ This allows us to seamlessly access LLMs running locally (using [ollama](https://ollama.com/)) or call inference providers like Nebius, Replicate, etc.
26
+
27
+ Also significantly simplified LLM configuration.
28
+
29
+ **Expanded support for many file types (pdf, docx)** ([#37](https://github.com/The-AI-Alliance/allycat/issues/37))
30
+
31
+ Before, we just handled HTML files. Now we can download and process other popular file types - like PDF, DOCX, etc. We use [Docling](https://github.com/docling-project/docling) for processing files.
32
+
33
+
34
+ **Added [uv](https://docs.astral.sh/uv/) package manager support** ([#26](https://github.com/The-AI-Alliance/allycat/issues/26))
35
+
36
+ UV will be the preferred package manager going forward. We will still maintain `requirements.txt` to support other package managers.
37
+
38
+
39
+ **Better config management** ([#19](https://github.com/The-AI-Alliance/allycat/issues/19))
40
+
41
+ A lot of user configuration can be set using the `.env` file. This simplifies config management and allows for easier and faster experimentation without changing code.
42
+
43
+
44
+ **Documentation update**
45
+
46
+ Various doc updates.
47
+
48
+ **Huge thanks to all the contributors**
49
+
50
+ - [Steven Pousty](https://github.com/thesteve0) ([linkedin](https://www.linkedin.com/in/thesteve0/))
51
+ - [Santosh Borse](https://github.com/santoshborse)
package-lock.json ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "allycat",
3
+ "lockfileVersion": 3,
4
+ "requires": true,
5
+ "packages": {
6
+ "": {
7
+ "dependencies": {
8
+ "dotenv": "^17.2.1",
9
+ "neo4j": "^2.0.0-RC2"
10
+ }
11
+ },
12
+ "node_modules/ajv": {
13
+ "version": "6.12.6",
14
+ "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
15
+ "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==",
16
+ "license": "MIT",
17
+ "dependencies": {
18
+ "fast-deep-equal": "^3.1.1",
19
+ "fast-json-stable-stringify": "^2.0.0",
20
+ "json-schema-traverse": "^0.4.1",
21
+ "uri-js": "^4.2.2"
22
+ },
23
+ "funding": {
24
+ "type": "github",
25
+ "url": "https://github.com/sponsors/epoberezkin"
26
+ }
27
+ },
28
+ "node_modules/asn1": {
29
+ "version": "0.2.6",
30
+ "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.6.tgz",
31
+ "integrity": "sha512-ix/FxPn0MDjeyJ7i/yoHGFt/EX6LyNbxSEhPPXODPL+KB0VPk86UYfL0lMdy+KCnv+fmvIzySwaK5COwqVbWTQ==",
32
+ "license": "MIT",
33
+ "dependencies": {
34
+ "safer-buffer": "~2.1.0"
35
+ }
36
+ },
37
+ "node_modules/assert-plus": {
38
+ "version": "1.0.0",
39
+ "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz",
40
+ "integrity": "sha512-NfJ4UzBCcQGLDlQq7nHxH+tv3kyZ0hHQqF5BO6J7tNJeP5do1llPr8dZ8zHonfhAu0PHAdMkSo+8o0wxg9lZWw==",
41
+ "license": "MIT",
42
+ "engines": {
43
+ "node": ">=0.8"
44
+ }
45
+ },
46
+ "node_modules/asynckit": {
47
+ "version": "0.4.0",
48
+ "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
49
+ "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==",
50
+ "license": "MIT"
51
+ },
52
+ "node_modules/aws-sign2": {
53
+ "version": "0.7.0",
54
+ "resolved": "https://registry.npmjs.org/aws-sign2/-/aws-sign2-0.7.0.tgz",
55
+ "integrity": "sha512-08kcGqnYf/YmjoRhfxyu+CLxBjUtHLXLXX/vUfx9l2LYzG3c1m61nrpyFUZI6zeS+Li/wWMMidD9KgrqtGq3mA==",
56
+ "license": "Apache-2.0",
57
+ "engines": {
58
+ "node": "*"
59
+ }
60
+ },
61
+ "node_modules/aws4": {
62
+ "version": "1.13.2",
63
+ "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz",
64
+ "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==",
65
+ "license": "MIT"
66
+ },
67
+ "node_modules/bcrypt-pbkdf": {
68
+ "version": "1.0.2",
69
+ "resolved": "https://registry.npmjs.org/bcrypt-pbkdf/-/bcrypt-pbkdf-1.0.2.tgz",
70
+ "integrity": "sha512-qeFIXtP4MSoi6NLqO12WfqARWWuCKi2Rn/9hJLEmtB5yTNr9DqFWkJRCf2qShWzPeAMRnOgCrq0sg/KLv5ES9w==",
71
+ "license": "BSD-3-Clause",
72
+ "dependencies": {
73
+ "tweetnacl": "^0.14.3"
74
+ }
75
+ },
76
+ "node_modules/caseless": {
77
+ "version": "0.12.0",
78
+ "resolved": "https://registry.npmjs.org/caseless/-/caseless-0.12.0.tgz",
79
+ "integrity": "sha512-4tYFyifaFfGacoiObjJegolkwSU4xQNGbVgUiNYVUxbQ2x2lUsFvY4hVgVzGiIe6WLOPqycWXA40l+PWsxthUw==",
80
+ "license": "Apache-2.0"
81
+ },
82
+ "node_modules/combined-stream": {
83
+ "version": "1.0.8",
84
+ "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
85
+ "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
86
+ "license": "MIT",
87
+ "dependencies": {
88
+ "delayed-stream": "~1.0.0"
89
+ },
90
+ "engines": {
91
+ "node": ">= 0.8"
92
+ }
93
+ },
94
+ "node_modules/core-util-is": {
95
+ "version": "1.0.2",
96
+ "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz",
97
+ "integrity": "sha512-3lqz5YjWTYnW6dlDa5TLaTCcShfar1e40rmcJVwCBJC6mWlFuj0eCHIElmG1g5kyuJ/GD+8Wn4FFCcz4gJPfaQ==",
98
+ "license": "MIT"
99
+ },
100
+ "node_modules/dashdash": {
101
+ "version": "1.14.1",
102
+ "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz",
103
+ "integrity": "sha512-jRFi8UDGo6j+odZiEpjazZaWqEal3w/basFjQHQEwVtZJGDpxbH1MeYluwCS8Xq5wmLJooDlMgvVarmWfGM44g==",
104
+ "license": "MIT",
105
+ "dependencies": {
106
+ "assert-plus": "^1.0.0"
107
+ },
108
+ "engines": {
109
+ "node": ">=0.10"
110
+ }
111
+ },
112
+ "node_modules/delayed-stream": {
113
+ "version": "1.0.0",
114
+ "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
115
+ "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
116
+ "license": "MIT",
117
+ "engines": {
118
+ "node": ">=0.4.0"
119
+ }
120
+ },
121
+ "node_modules/dotenv": {
122
+ "version": "17.2.1",
123
+ "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.2.1.tgz",
124
+ "integrity": "sha512-kQhDYKZecqnM0fCnzI5eIv5L4cAe/iRI+HqMbO/hbRdTAeXDG+M9FjipUxNfbARuEg4iHIbhnhs78BCHNbSxEQ==",
125
+ "license": "BSD-2-Clause",
126
+ "engines": {
127
+ "node": ">=12"
128
+ },
129
+ "funding": {
130
+ "url": "https://dotenvx.com"
131
+ }
132
+ },
133
+ "node_modules/ecc-jsbn": {
134
+ "version": "0.1.2",
135
+ "resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.2.tgz",
136
+ "integrity": "sha512-eh9O+hwRHNbG4BLTjEl3nw044CkGm5X6LoaCf7LPp7UU8Qrt47JYNi6nPX8xjW97TKGKm1ouctg0QSpZe9qrnw==",
137
+ "license": "MIT",
138
+ "dependencies": {
139
+ "jsbn": "~0.1.0",
140
+ "safer-buffer": "^2.1.0"
141
+ }
142
+ },
143
+ "node_modules/extend": {
144
+ "version": "3.0.2",
145
+ "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
146
+ "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==",
147
+ "license": "MIT"
148
+ },
149
+ "node_modules/extsprintf": {
150
+ "version": "1.3.0",
151
+ "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz",
152
+ "integrity": "sha512-11Ndz7Nv+mvAC1j0ktTa7fAb0vLyGGX+rMHNBYQviQDGU0Hw7lhctJANqbPhu9nV9/izT/IntTgZ7Im/9LJs9g==",
153
+ "engines": [
154
+ "node >=0.6.0"
155
+ ],
156
+ "license": "MIT"
157
+ },
158
+ "node_modules/fast-deep-equal": {
159
+ "version": "3.1.3",
160
+ "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz",
161
+ "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==",
162
+ "license": "MIT"
163
+ },
164
+ "node_modules/fast-json-stable-stringify": {
165
+ "version": "2.1.0",
166
+ "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz",
167
+ "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==",
168
+ "license": "MIT"
169
+ },
170
+ "node_modules/forever-agent": {
171
+ "version": "0.6.1",
172
+ "resolved": "https://registry.npmjs.org/forever-agent/-/forever-agent-0.6.1.tgz",
173
+ "integrity": "sha512-j0KLYPhm6zeac4lz3oJ3o65qvgQCcPubiyotZrXqEaG4hNagNYO8qdlUrX5vwqv9ohqeT/Z3j6+yW067yWWdUw==",
174
+ "license": "Apache-2.0",
175
+ "engines": {
176
+ "node": "*"
177
+ }
178
+ },
179
+ "node_modules/form-data": {
180
+ "version": "2.3.3",
181
+ "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz",
182
+ "integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==",
183
+ "license": "MIT",
184
+ "dependencies": {
185
+ "asynckit": "^0.4.0",
186
+ "combined-stream": "^1.0.6",
187
+ "mime-types": "^2.1.12"
188
+ },
189
+ "engines": {
190
+ "node": ">= 0.12"
191
+ }
192
+ },
193
+ "node_modules/getpass": {
194
+ "version": "0.1.7",
195
+ "resolved": "https://registry.npmjs.org/getpass/-/getpass-0.1.7.tgz",
196
+ "integrity": "sha512-0fzj9JxOLfJ+XGLhR8ze3unN0KZCgZwiSSDz168VERjK8Wl8kVSdcu2kspd4s4wtAa1y/qrVRiAA0WclVsu0ng==",
197
+ "license": "MIT",
198
+ "dependencies": {
199
+ "assert-plus": "^1.0.0"
200
+ }
201
+ },
202
+ "node_modules/har-schema": {
203
+ "version": "2.0.0",
204
+ "resolved": "https://registry.npmjs.org/har-schema/-/har-schema-2.0.0.tgz",
205
+ "integrity": "sha512-Oqluz6zhGX8cyRaTQlFMPw80bSJVG2x/cFb8ZPhUILGgHka9SsokCCOQgpveePerqidZOrT14ipqfJb7ILcW5Q==",
206
+ "license": "ISC",
207
+ "engines": {
208
+ "node": ">=4"
209
+ }
210
+ },
211
+ "node_modules/har-validator": {
212
+ "version": "5.1.5",
213
+ "resolved": "https://registry.npmjs.org/har-validator/-/har-validator-5.1.5.tgz",
214
+ "integrity": "sha512-nmT2T0lljbxdQZfspsno9hgrG3Uir6Ks5afism62poxqBM6sDnMEuPmzTq8XN0OEwqKLLdh1jQI3qyE66Nzb3w==",
215
+ "deprecated": "this library is no longer supported",
216
+ "license": "MIT",
217
+ "dependencies": {
218
+ "ajv": "^6.12.3",
219
+ "har-schema": "^2.0.0"
220
+ },
221
+ "engines": {
222
+ "node": ">=6"
223
+ }
224
+ },
225
+ "node_modules/http-signature": {
226
+ "version": "1.2.0",
227
+ "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.2.0.tgz",
228
+ "integrity": "sha512-CAbnr6Rz4CYQkLYUtSNXxQPUH2gK8f3iWexVlsnMeD+GjlsQ0Xsy1cOX+mN3dtxYomRy21CiOzU8Uhw6OwncEQ==",
229
+ "license": "MIT",
230
+ "dependencies": {
231
+ "assert-plus": "^1.0.0",
232
+ "jsprim": "^1.2.2",
233
+ "sshpk": "^1.7.0"
234
+ },
235
+ "engines": {
236
+ "node": ">=0.8",
237
+ "npm": ">=1.3.7"
238
+ }
239
+ },
240
+ "node_modules/is-typedarray": {
241
+ "version": "1.0.0",
242
+ "resolved": "https://registry.npmjs.org/is-typedarray/-/is-typedarray-1.0.0.tgz",
243
+ "integrity": "sha512-cyA56iCMHAh5CdzjJIa4aohJyeO1YbwLi3Jc35MmRU6poroFjIGZzUzupGiRPOjgHg9TLu43xbpwXk523fMxKA==",
244
+ "license": "MIT"
245
+ },
246
+ "node_modules/isstream": {
247
+ "version": "0.1.2",
248
+ "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz",
249
+ "integrity": "sha512-Yljz7ffyPbrLpLngrMtZ7NduUgVvi6wG9RJ9IUcyCd59YQ911PBJphODUcbOVbqYfxe1wuYf/LJ8PauMRwsM/g==",
250
+ "license": "MIT"
251
+ },
252
+ "node_modules/jsbn": {
253
+ "version": "0.1.1",
254
+ "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
255
+ "integrity": "sha512-UVU9dibq2JcFWxQPA6KCqj5O42VOmAY3zQUfEKxU0KpTGXwNoCjkX1e13eHNvw/xPynt6pU0rZ1htjWTNTSXsg==",
256
+ "license": "MIT"
257
+ },
258
+ "node_modules/json-schema": {
259
+ "version": "0.4.0",
260
+ "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz",
261
+ "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==",
262
+ "license": "(AFL-2.1 OR BSD-3-Clause)"
263
+ },
264
+ "node_modules/json-schema-traverse": {
265
+ "version": "0.4.1",
266
+ "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",
267
+ "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==",
268
+ "license": "MIT"
269
+ },
270
+ "node_modules/json-stringify-safe": {
271
+ "version": "5.0.1",
272
+ "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz",
273
+ "integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==",
274
+ "license": "ISC"
275
+ },
276
+ "node_modules/jsprim": {
277
+ "version": "1.4.2",
278
+ "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.2.tgz",
279
+ "integrity": "sha512-P2bSOMAc/ciLz6DzgjVlGJP9+BrJWu5UDGK70C2iweC5QBIeFf0ZXRvGjEj2uYgrY2MkAAhsSWHDWlFtEroZWw==",
280
+ "license": "MIT",
281
+ "dependencies": {
282
+ "assert-plus": "1.0.0",
283
+ "extsprintf": "1.3.0",
284
+ "json-schema": "0.4.0",
285
+ "verror": "1.10.0"
286
+ },
287
+ "engines": {
288
+ "node": ">=0.6.0"
289
+ }
290
+ },
291
+ "node_modules/mime-db": {
292
+ "version": "1.52.0",
293
+ "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
294
+ "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
295
+ "license": "MIT",
296
+ "engines": {
297
+ "node": ">= 0.6"
298
+ }
299
+ },
300
+ "node_modules/mime-types": {
301
+ "version": "2.1.35",
302
+ "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
303
+ "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
304
+ "license": "MIT",
305
+ "dependencies": {
306
+ "mime-db": "1.52.0"
307
+ },
308
+ "engines": {
309
+ "node": ">= 0.6"
310
+ }
311
+ },
312
+ "node_modules/neo4j": {
313
+ "version": "2.0.0-RC2",
314
+ "resolved": "https://registry.npmjs.org/neo4j/-/neo4j-2.0.0-RC2.tgz",
315
+ "integrity": "sha512-TTTRwv8t3S0Mp6rVtgY4RNt+SCSl+ccuXhP6DmXERtNp5Vs8LlJ85uZiGJKcT74Xthqc4ihl517+bBOqQJxhNA==",
316
+ "license": "Apache-2.0",
317
+ "dependencies": {
318
+ "request": "^2.27.0",
319
+ "underscore": "1.7.x"
320
+ },
321
+ "engines": {
322
+ "node": ">= 0.10"
323
+ }
324
+ },
325
+ "node_modules/oauth-sign": {
326
+ "version": "0.9.0",
327
+ "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.9.0.tgz",
328
+ "integrity": "sha512-fexhUFFPTGV8ybAtSIGbV6gOkSv8UtRbDBnAyLQw4QPKkgNlsH2ByPGtMUqdWkos6YCRmAqViwgZrJc/mRDzZQ==",
329
+ "license": "Apache-2.0",
330
+ "engines": {
331
+ "node": "*"
332
+ }
333
+ },
334
+ "node_modules/performance-now": {
335
+ "version": "2.1.0",
336
+ "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz",
337
+ "integrity": "sha512-7EAHlyLHI56VEIdK57uwHdHKIaAGbnXPiw0yWbarQZOKaKpvUIgW0jWRVLiatnM+XXlSwsanIBH/hzGMJulMow==",
338
+ "license": "MIT"
339
+ },
340
+ "node_modules/psl": {
341
+ "version": "1.15.0",
342
+ "resolved": "https://registry.npmjs.org/psl/-/psl-1.15.0.tgz",
343
+ "integrity": "sha512-JZd3gMVBAVQkSs6HdNZo9Sdo0LNcQeMNP3CozBJb3JYC/QUYZTnKxP+f8oWRX4rHP5EurWxqAHTSwUCjlNKa1w==",
344
+ "license": "MIT",
345
+ "dependencies": {
346
+ "punycode": "^2.3.1"
347
+ },
348
+ "funding": {
349
+ "url": "https://github.com/sponsors/lupomontero"
350
+ }
351
+ },
352
+ "node_modules/punycode": {
353
+ "version": "2.3.1",
354
+ "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz",
355
+ "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==",
356
+ "license": "MIT",
357
+ "engines": {
358
+ "node": ">=6"
359
+ }
360
+ },
361
+ "node_modules/qs": {
362
+ "version": "6.5.3",
363
+ "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.3.tgz",
364
+ "integrity": "sha512-qxXIEh4pCGfHICj1mAJQ2/2XVZkjCDTcEgfoSQxc/fYivUZxTkk7L3bDBJSoNrEzXI17oUO5Dp07ktqE5KzczA==",
365
+ "license": "BSD-3-Clause",
366
+ "engines": {
367
+ "node": ">=0.6"
368
+ }
369
+ },
370
+ "node_modules/request": {
371
+ "version": "2.88.2",
372
+ "resolved": "https://registry.npmjs.org/request/-/request-2.88.2.tgz",
373
+ "integrity": "sha512-MsvtOrfG9ZcrOwAW+Qi+F6HbD0CWXEh9ou77uOb7FM2WPhwT7smM833PzanhJLsgXjN89Ir6V2PczXNnMpwKhw==",
374
+ "deprecated": "request has been deprecated, see https://github.com/request/request/issues/3142",
375
+ "license": "Apache-2.0",
376
+ "dependencies": {
377
+ "aws-sign2": "~0.7.0",
378
+ "aws4": "^1.8.0",
379
+ "caseless": "~0.12.0",
380
+ "combined-stream": "~1.0.6",
381
+ "extend": "~3.0.2",
382
+ "forever-agent": "~0.6.1",
383
+ "form-data": "~2.3.2",
384
+ "har-validator": "~5.1.3",
385
+ "http-signature": "~1.2.0",
386
+ "is-typedarray": "~1.0.0",
387
+ "isstream": "~0.1.2",
388
+ "json-stringify-safe": "~5.0.1",
389
+ "mime-types": "~2.1.19",
390
+ "oauth-sign": "~0.9.0",
391
+ "performance-now": "^2.1.0",
392
+ "qs": "~6.5.2",
393
+ "safe-buffer": "^5.1.2",
394
+ "tough-cookie": "~2.5.0",
395
+ "tunnel-agent": "^0.6.0",
396
+ "uuid": "^3.3.2"
397
+ },
398
+ "engines": {
399
+ "node": ">= 6"
400
+ }
401
+ },
402
+ "node_modules/safe-buffer": {
403
+ "version": "5.2.1",
404
+ "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
405
+ "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
406
+ "funding": [
407
+ {
408
+ "type": "github",
409
+ "url": "https://github.com/sponsors/feross"
410
+ },
411
+ {
412
+ "type": "patreon",
413
+ "url": "https://www.patreon.com/feross"
414
+ },
415
+ {
416
+ "type": "consulting",
417
+ "url": "https://feross.org/support"
418
+ }
419
+ ],
420
+ "license": "MIT"
421
+ },
422
+ "node_modules/safer-buffer": {
423
+ "version": "2.1.2",
424
+ "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
425
+ "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
426
+ "license": "MIT"
427
+ },
428
+ "node_modules/sshpk": {
429
+ "version": "1.18.0",
430
+ "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.18.0.tgz",
431
+ "integrity": "sha512-2p2KJZTSqQ/I3+HX42EpYOa2l3f8Erv8MWKsy2I9uf4wA7yFIkXRffYdsx86y6z4vHtV8u7g+pPlr8/4ouAxsQ==",
432
+ "license": "MIT",
433
+ "dependencies": {
434
+ "asn1": "~0.2.3",
435
+ "assert-plus": "^1.0.0",
436
+ "bcrypt-pbkdf": "^1.0.0",
437
+ "dashdash": "^1.12.0",
438
+ "ecc-jsbn": "~0.1.1",
439
+ "getpass": "^0.1.1",
440
+ "jsbn": "~0.1.0",
441
+ "safer-buffer": "^2.0.2",
442
+ "tweetnacl": "~0.14.0"
443
+ },
444
+ "bin": {
445
+ "sshpk-conv": "bin/sshpk-conv",
446
+ "sshpk-sign": "bin/sshpk-sign",
447
+ "sshpk-verify": "bin/sshpk-verify"
448
+ },
449
+ "engines": {
450
+ "node": ">=0.10.0"
451
+ }
452
+ },
453
+ "node_modules/tough-cookie": {
454
+ "version": "2.5.0",
455
+ "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.5.0.tgz",
456
+ "integrity": "sha512-nlLsUzgm1kfLXSXfRZMc1KLAugd4hqJHDTvc2hDIwS3mZAfMEuMbc03SujMF+GEcpaX/qboeycw6iO8JwVv2+g==",
457
+ "license": "BSD-3-Clause",
458
+ "dependencies": {
459
+ "psl": "^1.1.28",
460
+ "punycode": "^2.1.1"
461
+ },
462
+ "engines": {
463
+ "node": ">=0.8"
464
+ }
465
+ },
466
+ "node_modules/tunnel-agent": {
467
+ "version": "0.6.0",
468
+ "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
469
+ "integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==",
470
+ "license": "Apache-2.0",
471
+ "dependencies": {
472
+ "safe-buffer": "^5.0.1"
473
+ },
474
+ "engines": {
475
+ "node": "*"
476
+ }
477
+ },
478
+ "node_modules/tweetnacl": {
479
+ "version": "0.14.5",
480
+ "resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz",
481
+ "integrity": "sha512-KXXFFdAbFXY4geFIwoyNK+f5Z1b7swfXABfL7HXCmoIWMKU3dmS26672A4EeQtDzLKy7SXmfBu51JolvEKwtGA==",
482
+ "license": "Unlicense"
483
+ },
484
+ "node_modules/underscore": {
485
+ "version": "1.7.0",
486
+ "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.7.0.tgz",
487
+ "integrity": "sha512-cp0oQQyZhUM1kpJDLdGO1jPZHgS/MpzoWYfe9+CM2h/QGDZlqwT2T3YGukuBdaNJ/CAPoeyAZRRHz8JFo176vA=="
488
+ },
489
+ "node_modules/uri-js": {
490
+ "version": "4.4.1",
491
+ "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz",
492
+ "integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==",
493
+ "license": "BSD-2-Clause",
494
+ "dependencies": {
495
+ "punycode": "^2.1.0"
496
+ }
497
+ },
498
+ "node_modules/uuid": {
499
+ "version": "3.4.0",
500
+ "resolved": "https://registry.npmjs.org/uuid/-/uuid-3.4.0.tgz",
501
+ "integrity": "sha512-HjSDRw6gZE5JMggctHBcjVak08+KEVhSIiDzFnT9S9aegmp85S/bReBVTb4QTFaRNptJ9kuYaNhnbNEOkbKb/A==",
502
+ "deprecated": "Please upgrade to version 7 or higher. Older versions may use Math.random() in certain circumstances, which is known to be problematic. See https://v8.dev/blog/math-random for details.",
503
+ "license": "MIT",
504
+ "bin": {
505
+ "uuid": "bin/uuid"
506
+ }
507
+ },
508
+ "node_modules/verror": {
509
+ "version": "1.10.0",
510
+ "resolved": "https://registry.npmjs.org/verror/-/verror-1.10.0.tgz",
511
+ "integrity": "sha512-ZZKSmDAEFOijERBLkmYfJ+vmk3w+7hOLYDNkRCuRuMJGEmqYNCNLyBBFwWKVMhfwaEF3WOd0Zlw86U/WC/+nYw==",
512
+ "engines": [
513
+ "node >=0.6.0"
514
+ ],
515
+ "license": "MIT",
516
+ "dependencies": {
517
+ "assert-plus": "^1.0.0",
518
+ "core-util-is": "1.0.2",
519
+ "extsprintf": "^1.2.0"
520
+ }
521
+ }
522
+ }
523
+ }
package.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "dependencies": {
3
+ "dotenv": "^17.2.1",
4
+ "neo4j": "^2.0.0-RC2"
5
+ }
6
+ }
pyproject.toml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "allycat-1"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "chainlit>=2.2.1",
9
+ "docling>=2.41.0",
10
+ "flask>=3.1.1",
11
+ "humanfriendly>=10.0",
12
+ "litellm>=1.74.3",
13
+ "llama-index>=0.12.48",
14
+ "llama-index-embeddings-huggingface>=0.5.5",
15
+ "llama-index-llms-litellm>=0.5.1",
16
+ "llama-index-vector-stores-milvus>=0.8.5",
17
+ "milvus-lite>=2.5.1",
18
+ "mimetypes-magic>=0.4.30",
19
+ "nest-asyncio>=1.6.0",
20
+ "pandas>=2.3.1",
21
+ "pymilvus>=2.5.12",
22
+ "tqdm>=4.67.1",
23
+ ]
24
+
25
+ [dependency-groups]
26
+ dev = [
27
+ "ipykernel>=6.29.5",
28
+ ]
query_utils.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def tweak_query(query : str, model : str):
2
+ """
3
+ for qwen3 models, turn off thinking
4
+ """
5
+
6
+ # Check if the model is qwen3
7
+ if 'qwen3' in model:
8
+ # Check if the query contains '/no_think'
9
+ if '/no_think' not in query:
10
+ # Append '/no_think' to the query
11
+ query += '\n/no_think'
12
+ return query
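A quick usage sketch of `tweak_query` (the model strings are examples only):

```python
from query_utils import tweak_query

# qwen3 model: '/no_think' gets appended so the model skips its thinking phase
print(tweak_query("What is this website?", "ollama/qwen3:8b"))
# -> "What is this website?\n/no_think"

# non-qwen3 model: the query passes through unchanged
print(tweak_query("What is this website?", "cerebras/llama3.1-8b"))
```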
requirements-build.txt ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================
2
+ # BUILD/PIPELINE REQUIREMENTS - Full Pipeline
3
+ # ============================================
4
+ # These packages are ONLY needed for running the pipeline:
5
+ # - Crawling websites (1_crawl_site.py)
6
+ # - Processing files (2_process_files.py)
7
+ # - Graph extraction (2b_process_graph_phase*.py)
8
+ # - Saving to databases (3*.py)
9
+ #
10
+ # After pipeline completes, these can be REMOVED to save ~300-500 MB RAM
11
+
12
+ # ============================================
13
+ # Document Processing - REMOVE AFTER PIPELINE
14
+ # ============================================
15
+ docling # PDF/HTML to markdown conversion (~100 MB)
16
+ html2text # HTML processing (~10 MB)
17
+
18
+ # ============================================
19
+ # Graph Community Detection - REMOVE AFTER PIPELINE
20
+ # ============================================
21
+ igraph # Graph analysis library (~50 MB)
22
+ leidenalg # Leiden algorithm for communities (~30 MB)
23
+ graspologic # Graph statistics (~40 MB)
24
+
25
+ # ============================================
26
+ # Development Tools - REMOVE AFTER PIPELINE
27
+ # ============================================
28
+ milvus-lite==2.4.11 # Local Milvus server (not needed if using cloud Zilliz) (~100 MB)
29
+ tqdm # Progress bars (nice to have, but not essential)
30
+ ipykernel # Jupyter support (only for development)
31
+ fastmcp # MCP support (only for development)
32
+
33
+ # ============================================
34
+ # Total Savings if Removed: ~350-500 MB
35
+ # ============================================
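For illustration, one way the post-pipeline cleanup could be done is by uninstalling exactly the packages listed in this file. This is a sketch under that assumption, not necessarily how the project's own cleanup step is implemented:

```python
# Illustrative sketch: remove pipeline-only dependencies after the pipeline
# has finished, to reclaim memory/disk in small containers.
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "pip", "uninstall", "-y", "-r", "requirements-build.txt"],
    check=True,
)
```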
requirements-docker-cloud.txt ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core document processing (lightweight)
2
+ docling
3
+ html2text
4
+
5
+ # Asyncio support
6
+ nest_asyncio
7
+
8
+ # PyTorch CPU-only (much smaller than GPU)
9
+ --extra-index-url https://download.pytorch.org/whl/cpu
10
+ torch==2.6.0+cpu
11
+
12
+ # JSON parsing
13
+ orjson>=3.8.0
14
+ json-repair>=0.7.0
15
+
16
+ # Vector DB - Cloud Zilliz support
17
+ pymilvus==2.5.5
18
+
19
+ # LLM Integration
20
+ litellm
21
+
22
+ # LlamaIndex
23
+ llama-index
24
+ llama-index-embeddings-huggingface
25
+ llama-index-llms-litellm
26
+ llama-index-vector-stores-milvus==0.5.0
27
+
28
+ # Graph Database
29
+ neo4j
30
+ networkx
31
+ python-louvain
32
+ igraph
33
+ leidenalg
34
+ graspologic
35
+
36
+ # Graph LLM APIs
37
+ google-generativeai
38
+ openai>=1.0.0
39
+ fastmcp
40
+
41
+ # Web Framework
42
+ flask==2.3.3
43
+ chainlit
44
+
45
+ # Utilities
46
+ python-dotenv
47
+ humanfriendly
48
+ pandas
49
+ tqdm
requirements-docker.txt ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core document processing
2
+ docling
3
+ html2text
4
+
5
+ # Asyncio support
6
+ nest_asyncio
7
+
8
+ # PyTorch CPU-only (much smaller than GPU)
9
+ --extra-index-url https://download.pytorch.org/whl/cpu
10
+ torch==2.6.0+cpu
11
+
12
+ # JSON parsing
13
+ orjson>=3.8.0
14
+ json-repair>=0.7.0
15
+
16
+ # Vector DB - Supports both cloud (Zilliz) and local (Milvus Lite)
17
+ pymilvus==2.5.5
18
+ milvus-lite==2.4.11
19
+
20
+ # LLM Integration
21
+ litellm
22
+
23
+ # LlamaIndex
24
+ llama-index
25
+ llama-index-embeddings-huggingface
26
+ llama-index-llms-litellm
27
+ llama-index-llms-ollama
28
+ llama-index-vector-stores-milvus==0.5.0
29
+
30
+ # Graph Database
31
+ neo4j
32
+ networkx
33
+ python-louvain
34
+ igraph
35
+ leidenalg
36
+ graspologic
37
+
38
+ # Graph LLM APIs
39
+ google-generativeai
40
+ openai>=1.0.0
41
+ fastmcp
42
+
43
+ # Web Frameworks
44
+ flask==2.3.3
45
+ chainlit
46
+
47
+ # Utilities
48
+ python-dotenv
49
+ humanfriendly
50
+ pandas
51
+ tqdm
requirements-runtime.txt ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================
2
+ # RUNTIME REQUIREMENTS - Flask GraphRAG App
3
+ # ============================================
4
+ # These are the MINIMAL packages needed to run the Flask GraphRAG app
5
+ # after the pipeline has been completed.
6
+ # Use this for production deployments to save ~500 MB RAM
7
+
8
+ # ============================================
9
+ # Core Runtime - DO NOT REMOVE
10
+ # ============================================
11
+
12
+ # Asyncio support
13
+ nest_asyncio
14
+
15
+ # Advanced JSON parsing (for LLM responses)
16
+ orjson>=3.8.0
17
+ json-repair>=0.7.0
18
+
19
+ # Vector Database (client-only, lightweight)
20
+ pymilvus==2.5.5
21
+
22
+ # LLM Integration
23
+ litellm
24
+
25
+ # LlamaIndex Core (for querying)
26
+ llama-index
27
+ llama-index-embeddings-huggingface
28
+ llama-index-llms-litellm
29
+ llama-index-llms-ollama
30
+ llama-index-vector-stores-milvus==0.5.0
31
+
32
+ # Graph Database (client-only)
33
+ neo4j
34
+
35
+ # Graph Analysis (for community queries)
36
+ networkx
37
+
38
+ # LLM APIs (for query synthesis)
39
+ google-generativeai
40
+ openai>=1.0.0
41
+
42
+ # Web Framework
43
+ flask==2.3.3
44
+ chainlit # Chat UI (only if using chainlit apps)
45
+
46
+ # Utilities
47
+ python-dotenv
48
+ humanfriendly
49
+ pandas
50
+
51
+ # ============================================
52
+ # Embedding Model - LARGE (500+ MB)
53
+ # ============================================
54
+ # PyTorch CPU (required for sentence-transformers/embeddings)
55
+ --extra-index-url https://download.pytorch.org/whl/cpu
56
+ torch==2.6.0+cpu
57
+
58
+ # Note: For cloud embeddings in future, remove torch + sentence-transformers
59
+ # to save ~500 MB
requirements.txt ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core document processing
2
+ docling
3
+ html2text
4
+ # Asyncio support
5
+ nest_asyncio
6
+
7
+ # PyTorch CPU-only (much smaller than GPU)
8
+ --extra-index-url https://download.pytorch.org/whl/cpu
9
+ torch==2.6.0+cpu
10
+
11
+ # Advanced JSON parsing
12
+ orjson>=3.8.0
13
+ json-repair>=0.7.0
14
+
15
+ # Vector Database
16
+ pymilvus==2.5.5
17
+ milvus-lite==2.4.11
18
+
19
+ # LLM Integration
20
+ litellm
21
+
22
+ # LlamaIndex
23
+ llama-index
24
+ llama-index-embeddings-huggingface
25
+ llama-index-llms-litellm
26
+ llama-index-llms-ollama
27
+ llama-index-vector-stores-milvus==0.5.0
28
+
29
+ # Graph Database
30
+ neo4j
31
+ networkx
32
+ python-louvain
33
+ igraph
34
+ leidenalg
35
+ graspologic
36
+
37
+ # Graph LLM APIs
38
+ google-generativeai
39
+ openai>=1.0.0
40
+ fastmcp
41
+
42
+ # Web Frameworks
43
+ flask==2.3.3
44
+ chainlit
45
+
46
+ # Utilities
47
+ python-dotenv
48
+ humanfriendly
49
+ pandas
50
+ tqdm
51
+ ipykernel
uv.lock ADDED
The diff for this file is too large to render. See raw diff