Jaja-09 committed
Commit fb51816 · 1 Parent(s): fa89dc2
Files changed (3)
  1. Dockerfile +2 -0
  2. app.py +10 -9
  3. model_handler.py +334 -113
Dockerfile CHANGED
@@ -24,3 +24,5 @@ RUN mkdir -p /app/nltk_data /app/hf_cache && \
 
 EXPOSE 7860
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+
+#
app.py CHANGED
@@ -28,7 +28,7 @@ app = FastAPI(
 # Configure CORS
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # In production, replace with your frontend URL
+    allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
@@ -142,7 +142,7 @@ async def health_check():
     }
 
 
-@app.post("/api/analyze", response_model=AnalysisResponse)
+@app.post("/api/analyze")
 async def analyze_text(request: AnalysisRequest):
     """
     Analyze text using the DeBERTa AI detection model
@@ -158,12 +158,12 @@ async def analyze_text(request: AnalysisRequest):
     if not request.text or len(request.text.strip()) == 0:
         raise HTTPException(status_code=400, detail="Text cannot be empty")
 
-    # Check text length for meaningful analysis (80-7000 words)
+    # Check text length for meaningful analysis (50-7000 words)
     word_count = len(request.text.split())
-    if word_count < 80:
+    if word_count < 50:
         raise HTTPException(
             status_code=400,
-            detail="Text is too short for analysis. Please provide at least 80 words for accurate AI detection and sentiment analysis."
+            detail="Text is too short for analysis. Please provide at least 50 words for accurate AI detection and sentiment analysis."
         )
 
     if word_count > 7000:
@@ -200,12 +200,12 @@ async def detect_ai(request: AnalysisRequest):
     if not request.text or len(request.text.strip()) == 0:
         raise HTTPException(status_code=400, detail="Text cannot be empty")
 
-    # Check text length (80-7000 words)
+    # Check text length (50-7000 words)
     word_count = len(request.text.split())
-    if word_count < 80:
+    if word_count < 50:
         raise HTTPException(
             status_code=400,
-            detail="Text is too short. Please provide at least 80 words."
+            detail="Text is too short. Please provide at least 50 words."
         )
     elif word_count > 7000:
         raise HTTPException(
@@ -222,7 +222,8 @@
             "probability": result["probability"],
             "confidence": result["confidence"],
             "explanation": result["explanation"],
-            "mixed_analysis": result.get("mixed_analysis", None)
+            "mixed_analysis": result.get("mixed_analysis", None),
+            "modelProcessingTime": result.get("modelProcessingTime", None)
         }
 
     except HTTPException:
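With these changes, `/api/analyze` accepts texts of 50 words or more, and the response gains a `modelProcessingTime` field. Note that `analyze_text` in model_handler.py still enforces an 80-word minimum, so inputs of 50-79 words pass the endpoint check but raise a ValueError in the handler; the client sketch below uses a comfortably long input. The URL and port come from the Dockerfile's uvicorn command; the request field name is taken from `request.text` in the endpoint code.

```python
import requests

# Minimal sketch, assuming the server runs locally as in the Dockerfile:
# uvicorn app:app --host 0.0.0.0 --port 7860
text = "word " * 100  # >= 80 words, so it satisfies both validation layers

resp = requests.post(
    "http://localhost:7860/api/analyze",
    json={"text": text},  # field name taken from AnalysisRequest's .text
    timeout=120,
)
print(resp.status_code)
print(resp.json())
```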
model_handler.py CHANGED
@@ -10,6 +10,7 @@ import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoConfig, AutoModel, PreTrainedModel, AutoModelForSequenceClassification
 import os
 import logging
+import time
 from typing import Dict, Any, Optional, List, Tuple
 import numpy as np
 from pathlib import Path
@@ -18,11 +19,17 @@ import json
 import nltk
 from nltk.tokenize import sent_tokenize
 
+
 # Download NLTK data
 try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt', quiet=True)
+
+try:
+    nltk.data.find('tokenizers/punkt_tab')
+except LookupError:
+    nltk.download('punkt_tab', quiet=True)
 
 logger = logging.getLogger(__name__)
 
@@ -80,7 +87,7 @@ class AIDetectionModelHandler:
         Initialize the model handler
 
         Args:
-            model_path: Path to the model directory (default: env MODEL_PATH or /app/model)
+            model_path: Path to the model directory (default: ../model/model)
             max_length: Maximum token length for input text
         """
         self.max_length = max_length
@@ -94,20 +101,11 @@ class AIDetectionModelHandler:
 
         # Default model paths
         if model_path is None:
-            # Prefer explicit env var
-            env_model_path = os.getenv("MODEL_PATH")
-            if env_model_path and os.path.exists(env_model_path):
-                model_path = env_model_path
-            elif os.path.exists("/app/model"):
-                model_path = "/app/model"
-            else:
-                # Fallback to legacy relative path
-                backend_dir = Path(__file__).parent
-                model_path = str(backend_dir.parent / "model" / "model")
+            backend_dir = Path(__file__).parent
+            model_path = str(backend_dir.parent / "model" / "model")
 
         self.model_path = model_path
-        # XGBoost file is expected inside the same folder as the other model artifacts
-        self.xgboost_path = str(Path(model_path) / "xgboost_model.json")
+        self.xgboost_path = str(Path(model_path).parent / "xgboost_model.json")
 
         # Load the models
         self._load_models()
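The XGBoost artifact now resolves to the parent of `model_path` rather than inside it. A quick sketch of the two resolutions (the directory names are the defaults implied by the code above, not verified on disk):

```python
from pathlib import Path

# Default from the diff: backend/../model/model
backend_dir = Path("backend")
model_path = backend_dir.parent / "model" / "model"

old_location = Path(model_path) / "xgboost_model.json"         # before: model/model/xgboost_model.json
new_location = Path(model_path).parent / "xgboost_model.json"  # after:  model/xgboost_model.json
print(old_location, new_location)
```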
@@ -115,6 +113,9 @@
     def _load_models(self):
         """Load DeBERTa, sentiment model, and XGBoost classifier"""
         try:
+            logger.info(f"CUDA available: {torch.cuda.is_available()}")
+            logger.info(f"Selected device: {self.device}")
+
             logger.info(f"Loading models from: {self.model_path}")
             logger.info(f"Using device: {self.device}")
 
@@ -132,6 +133,8 @@
             self.deberta_model.to(self.device)
             self.deberta_model.eval()
 
+            print("DeBERTa model device:", next(self.deberta_model.parameters()).device)
+
             # 2. Load sentiment analysis model (DistilBERT)
             logger.info("Loading sentiment model...")
             sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
@@ -140,16 +143,40 @@
             self.sentiment_model.to(self.device)
             self.sentiment_model.eval()
 
+            print("Sentiment model device:", next(self.sentiment_model.parameters()).device)
+
             # 3. Load XGBoost model
             if os.path.exists(self.xgboost_path):
                 logger.info(f"Loading XGBoost model from: {self.xgboost_path}")
+                t0 = time.perf_counter()
                 self.xgboost_model = xgb.Booster()
                 self.xgboost_model.load_model(self.xgboost_path)
+                # Force GPU or CPU depending on hardware
+                if torch.cuda.is_available():
+                    logger.info("Setting XGBoost to use GPU predictor")
+                    try:
+                        self.xgboost_model.set_param({"predictor": "gpu_predictor", "tree_method": "gpu_hist"})
+                        logger.info("XGBoost configured to use GPU (gpu_predictor, gpu_hist)")
+                    except Exception as ie:
+                        logger.warning(f"Failed to set XGBoost GPU params: {ie}")
+                else:
+                    logger.info("Setting XGBoost to use CPU predictor")
+                    try:
+                        self.xgboost_model.set_param({"predictor": "cpu_predictor", "tree_method": "hist"})
+                    except Exception as ie:
+                        logger.warning(f"Failed to set XGBoost CPU params: {ie}")
+
+                t1 = time.perf_counter()
+                logger.info(f"XGBoost model loaded in {t1 - t0:.4f}s")
                 logger.info("✅ XGBoost model loaded!")
             else:
                 logger.warning(f"XGBoost model not found at {self.xgboost_path}, using DeBERTa only")
                 self.xgboost_model = None
 
+            # 🔍 OPTIONAL: PRINT GPU NAME
+            if torch.cuda.is_available():
+                print("GPU detected:", torch.cuda.get_device_name(0))
+
             self.model_loaded = True
             logger.info("✅ All models loaded successfully!")
 
@@ -179,9 +206,11 @@
             return [0.5]  # Neutral if no sentences
 
         scores = []
-
+        start_total = time.perf_counter()
+
         with torch.no_grad():
-            for sentence in sentences:
+            for i, sentence in enumerate(sentences):
+                s0 = time.perf_counter()
                 # Tokenize sentence
                 inputs = self.sentiment_tokenizer(
                     sentence,
@@ -192,6 +221,7 @@
                 )
                 inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
+
                 # Get sentiment prediction
                 outputs = self.sentiment_model(**inputs)
                 logits = outputs.logits
@@ -202,7 +232,10 @@
                 # Convert to polarity score (-1 to 1, where 0.5 is neutral)
                 polarity = (pos_prob - 0.5) * 2  # Maps [0,1] to [-1,1]
                 scores.append(polarity)
-
+                s1 = time.perf_counter()
+                logger.debug(f"Sentiment sentence processed in {s1 - s0:.4f}s")
+            total_time = time.perf_counter() - start_total
+            logger.info(f"Extracted sentiment scores for {len(sentences)} sentences in {total_time:.4f}s")
             return scores
 
         except Exception as e:
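The polarity formula in this hunk is a linear rescale of the positive-class probability; a quick check of its endpoints:

```python
# (pos_prob - 0.5) * 2 maps [0, 1] onto [-1, 1], with 0.5 -> 0.0 (neutral)
for pos_prob in (0.0, 0.25, 0.5, 1.0):
    print(pos_prob, (pos_prob - 0.5) * 2)
# 0.0 -> -1.0, 0.25 -> -0.5, 0.5 -> 0.0, 1.0 -> 1.0
```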
@@ -219,12 +252,16 @@
         Returns:
             Numpy array with [avg_polarity, polarity_variance]
         """
+        start = time.perf_counter()
         sentiment_scores = self.get_sentiment_scores(text)
 
         # Calculate features
         avg_polarity = float(np.mean(sentiment_scores)) if sentiment_scores else 0.0
         polarity_variance = float(np.var(sentiment_scores)) if len(sentiment_scores) > 1 else 0.0
 
+        duration = time.perf_counter() - start
+        logger.info(f"Sentiment features extracted in {duration:.4f}s (avg_polarity={avg_polarity:.4f}, variance={polarity_variance:.4f})")
+
         return np.array([avg_polarity, polarity_variance], dtype=np.float32)
 
     def get_deberta_embeddings(self, text: str) -> np.ndarray:
@@ -238,7 +275,9 @@
             Numpy array of embeddings
         """
         try:
+            t_total = time.perf_counter()
             # Tokenize input
+            t0 = time.perf_counter()
             encoded = self.tokenizer(
                 text,
                 padding='max_length',
@@ -247,23 +286,33 @@
                 return_tensors='pt'
             )
 
+            t1 = time.perf_counter()
+            logger.debug(f"Tokenization time: {t1 - t0:.4f}s")
+
             input_ids = encoded['input_ids'].to(self.device)
             attention_mask = encoded['attention_mask'].to(self.device)
 
             # Get embeddings
             with torch.no_grad():
+                t0 = time.perf_counter()
                 outputs = self.deberta_model.model(input_ids=input_ids, attention_mask=attention_mask)
+                t1 = time.perf_counter()
+                logger.debug(f"Transformer forward pass time: {t1 - t0:.4f}s")
+
                 last_hidden_state = outputs[0]
 
                 # Mean pooling
+                t0 = time.perf_counter()
                 input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
                 sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
                 sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
                 pooled_output = sum_embeddings / sum_mask
+                t1 = time.perf_counter()
+                logger.debug(f"Pooling time: {t1 - t0:.4f}s")
 
             # Convert to numpy
             embeddings = pooled_output.cpu().numpy().flatten()
-
+            total = time.perf_counter() - t_total
             return embeddings
 
         except Exception as e:
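The instrumented block above wraps the usual masked mean pooling; a standalone sketch on toy tensors shows what the three lines compute (shapes are illustrative):

```python
import torch

last_hidden_state = torch.randn(1, 4, 8)       # (batch, seq_len, hidden)
attention_mask = torch.tensor([[1, 1, 1, 0]])  # final position is padding

mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
sum_embeddings = torch.sum(last_hidden_state * mask, dim=1)  # zero out padding, sum over tokens
sum_mask = torch.clamp(mask.sum(dim=1), min=1e-9)            # real-token count, guarded against zero
pooled = sum_embeddings / sum_mask                           # mean over non-padding tokens only
print(pooled.shape)  # torch.Size([1, 8])
```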
@@ -285,19 +334,26 @@
             raise RuntimeError("Model not loaded. Cannot perform prediction.")
 
         try:
+            overall_start = time.perf_counter()
             # Extract sentiment features
             logger.info("Extracting sentiment features...")
+            sentiment_start = time.perf_counter()
             sentiment_features = self.extract_sentiment_features(text)
+            sentiment_time = time.perf_counter() - sentiment_start
             avg_polarity = float(sentiment_features[0])
             polarity_variance = float(sentiment_features[1])
-
+            logger.info(f"Sentiment extraction took {sentiment_time:.4f}s")
+
             # If XGBoost is available, use the full two-branch pipeline
             if self.xgboost_model is not None:
                 logger.info("Using XGBoost two-branch model...")
-
+                embed_start = time.perf_counter()
+
                 # Get DeBERTa embeddings
                 deberta_embeddings = self.get_deberta_embeddings(text)
-
+                embed_time = time.perf_counter() - embed_start
+                logger.info(f"DeBERTa embedding extraction took {embed_time:.4f}s")
+
                 # Combine features: DeBERTa embeddings + sentiment features
                 combined_features = np.concatenate([deberta_embeddings, sentiment_features])
 
@@ -305,8 +361,11 @@
                 dmatrix = xgb.DMatrix(combined_features.reshape(1, -1))
 
                 # Predict
+                xgb_start = time.perf_counter()
                 probability = float(self.xgboost_model.predict(dmatrix)[0])
-
+                xgb_time = time.perf_counter() - xgb_start
+                logger.info(f"XGBoost prediction took {xgb_time:.4f}s")
+
             else:
                 # Fallback to DeBERTa only
                 logger.info("Using DeBERTa model only (XGBoost not found)...")
@@ -323,11 +382,16 @@
                 attention_mask = encoded['attention_mask'].to(self.device)
 
                 with torch.no_grad():
+                    t0 = time.perf_counter()
                     outputs = self.deberta_model(input_ids=input_ids, attention_mask=attention_mask)
+                    t1 = time.perf_counter()
+                    logger.info(f"DeBERTa forward & classification took {t1 - t0:.4f}s")
                     logits = outputs["logits"]
                     probability = torch.sigmoid(logits).item()
 
             label = 1 if probability >= threshold else 0
+            overall_time = time.perf_counter() - overall_start
+            logger.info(f"Total prediction pipeline took {overall_time:.4f}s (prob={probability:.4f})")
 
             return {
                 "probability": probability,
@@ -355,6 +419,7 @@
             Tuple of (probability, label) where label is 0 for human, 1 for AI
         """
         try:
+            start_total = time.perf_counter()
             # Extract sentiment features
             sentiment_features = self.extract_sentiment_features(text)
             avg_polarity = float(sentiment_features[0])
@@ -362,18 +427,24 @@
 
             # If XGBoost is available, use the full two-branch pipeline
             if self.xgboost_model is not None:
+                embed_start = time.perf_counter()
                 # Get DeBERTa embeddings
                 deberta_embeddings = self.get_deberta_embeddings(text)
-
+                embed_time = time.perf_counter() - embed_start
+                logger.info(f"DeBERTa embedding extraction took {embed_time:.4f}s")
+
                 # Combine features: DeBERTa embeddings + sentiment features
                 combined_features = np.concatenate([deberta_embeddings, sentiment_features])
 
                 # Create DMatrix for XGBoost
                 dmatrix = xgb.DMatrix(combined_features.reshape(1, -1))
-
+                xgb_start = time.perf_counter()
+
                 # Predict
                 probability = float(self.xgboost_model.predict(dmatrix)[0])
-
+                xgb_time = time.perf_counter() - xgb_start
+                logger.info(f"XGBoost prediction (single) took {xgb_time:.4f}s")
+
             else:
                 # Fallback to DeBERTa only
                 encoded = self.tokenizer(
@@ -388,12 +459,16 @@
                 attention_mask = encoded['attention_mask'].to(self.device)
 
                 with torch.no_grad():
+                    t0 = time.perf_counter()
                     outputs = self.deberta_model(input_ids=input_ids, attention_mask=attention_mask)
+                    t1 = time.perf_counter()
+                    logger.info(f"DeBERTa forward (single) took {t1 - t0:.4f}s")
                     logits = outputs["logits"]
                     probability = torch.sigmoid(logits).item()
 
             label = 1 if probability >= 0.5 else 0
-
+            total = time.perf_counter() - start_total
+            logger.info(f"predict_single_text_xgboost total time: {total:.4f}s")
             return probability, label
 
         except Exception as e:
@@ -414,7 +489,7 @@
             Dictionary with prediction results and analysis details
 
         Note:
-            Input validation: Text must be 200-7000 words. Dynamic chunking: 4-5 sentences
+            Input validation: Text must be 80-2000 words. Dynamic chunking: 4-5 sentences
             analyzed as whole, then chunk size varies:
             - 6-10 sentences: 3 sentences per chunk
             - 11-20 sentences: 4 sentences per chunk
@@ -424,11 +499,11 @@
         """
         # Get overall prediction (your current method)
         overall_prob, overall_label = self.predict_single_text_xgboost(text)
-
+
         # Split text into sentences
         sentences = sent_tokenize(text)
 
-        # Validate input text length (80-7000 words)
+        # Validate input text length (80-2000 words)
         total_words = len(text.split())
         if total_words < 80:
             return {
@@ -440,17 +515,31 @@
                 'modified_probability': overall_prob,
                 'chunk_analysis': []
             }
-        elif total_words > 7000:
+        elif total_words > 2000:
             return {
                 'prediction': 'Human' if overall_label == 0 else 'AI',
                 'confidence': abs(overall_prob - 0.5) * 2,
                 'is_mixed': False,
-                'reason': f'Text too long for analysis ({total_words} words, maximum 7000 words allowed)',
+                'reason': f'Text too long for analysis ({total_words} words, maximum 2000 words allowed)',
                 'overall_probability': overall_prob,
                 'modified_probability': overall_prob,
                 'chunk_analysis': []
             }
 
+        # Compute sentence character offsets (start/end) to map back to original text
+        sentence_offsets: List[Tuple[int, int]] = []
+        search_start = 0
+        for sent in sentences:
+            # find the sentence occurrence starting from search_start
+            idx = text.find(sent, search_start)
+            if idx == -1:
+                # fallback: skip whitespace and set to previous end
+                idx = search_start
+            start_char = idx
+            end_char = start_char + len(sent)
+            sentence_offsets.append((start_char, end_char))
+            search_start = end_char
+
         # Dynamic chunking based on total sentence count
         total_sentences = len(sentences)
 
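The offset pass walks the original string with `str.find` from a moving cursor, so a repeated sentence resolves to its next occurrence rather than the first. A sketch of what it produces:

```python
from nltk.tokenize import sent_tokenize

text = "First sentence. Second one here. Third."
offsets, search_start = [], 0
for sent in sent_tokenize(text):
    idx = text.find(sent, search_start)
    if idx == -1:          # tokenizer altered the text; fall back to the cursor
        idx = search_start
    offsets.append((idx, idx + len(sent)))
    search_start = idx + len(sent)

print(offsets)  # [(0, 15), (16, 32), (33, 39)]
```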
@@ -488,28 +577,34 @@
                 'chunk_analysis': []
             }
 
-        # Create overlapping chunks
-        chunks = []
-        chunk_predictions = []
-        chunk_probabilities = []
-
+        # Create overlapping chunks and retain sentence index ranges
+        chunks = []  # textual chunks (for backward compat)
+        chunk_sentence_ranges: List[Tuple[int, int]] = []  # inclusive start, inclusive end sentence idx
+        chunk_predictions: List[Tuple[float, int]] = []
+        chunk_probabilities: List[float] = []
+
         logger.info(f"Analyzing text with {total_sentences} sentences using dynamic chunk size of {dynamic_chunk_size}...")
-
+
         for i in range(0, len(sentences) - dynamic_chunk_size + 1, dynamic_chunk_size - overlap):
             # Create chunk from sentences
-            chunk_sentences = sentences[i:i + dynamic_chunk_size]
+            start_idx = i
+            end_idx = i + dynamic_chunk_size - 1
+            chunk_sentences = sentences[start_idx:end_idx + 1]
             chunk_text = ' '.join(chunk_sentences)
-
+
             # Only analyze chunks that meet minimum length requirement
             if len(chunk_text.strip()) >= min_chunk_length:
                 chunks.append(chunk_text)
-
+                chunk_sentence_ranges.append((start_idx, end_idx))
+
                 # Analyze this chunk
                 prob, label = self.predict_single_text_xgboost(chunk_text)
                 chunk_predictions.append((prob, label))
                 chunk_probabilities.append(prob)
-
+
                 logger.info(f"  Chunk {len(chunks)}: {chunk_text[:60]}... → {'AI' if label == 1 else 'Human'} ({prob:.3f})")
+
+
 
         if len(chunk_predictions) < 2:
             return {
@@ -519,6 +614,10 @@
                 'reason': 'Too few chunks for mixed analysis',
                 'overall_probability': overall_prob,
                 'modified_probability': overall_prob,
+                'chunk_probabilities': chunk_probabilities,
+                'raw_chunks': [],
+                'sentence_analysis': [],
+                'merged_spans': [],
                 'chunk_analysis': chunk_predictions
             }
 
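The loop's stride means the final sentences can fall outside every window, which is exactly the case the nearest-chunk fallback added later in this commit handles. A sketch with 12 sentences, `dynamic_chunk_size=4`, and an assumed `overlap=1`:

```python
sentences = list(range(12))          # stand-ins for 12 tokenized sentences
dynamic_chunk_size, overlap = 4, 1   # overlap value assumed for illustration

for i in range(0, len(sentences) - dynamic_chunk_size + 1, dynamic_chunk_size - overlap):
    print(i, sentences[i:i + dynamic_chunk_size])
# 0 [0, 1, 2, 3]
# 3 [3, 4, 5, 6]
# 6 [6, 7, 8, 9]   <- sentences 10 and 11 are never covered by a chunk
```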
@@ -530,68 +629,148 @@
         # Mixed text detection logic
         is_mixed = human_chunks > 0 and ai_chunks > 0
         mixed_ratio = min(human_chunks, ai_chunks) / total_chunks
+        chunk_avg_prob = float(np.mean(chunk_probabilities)) if chunk_probabilities else overall_prob
+        chunk_label = 'AI' if chunk_avg_prob >= 0.5 else 'Human'
 
         logger.info(f"\nChunk Analysis Summary:")
         logger.info(f"  Total chunks analyzed: {total_chunks}")
         logger.info(f"  Human chunks: {human_chunks}")
         logger.info(f"  AI chunks: {ai_chunks}")
         logger.info(f"  Mixed ratio: {mixed_ratio:.2f}")
+        logger.info(f"  Average chunk probability: {chunk_avg_prob:.3f}")
+        logger.info(f"  Chunk-derived label: {chunk_label}")
 
-        # MODIFY OVERALL PROBABILITY BASED ON CHUNK ANALYSIS
-        if is_mixed and mixed_ratio > 0.25:  # At least 25% of each type
-            # Calculate weighted average of chunk probabilities
-            # Weight by chunk length (longer chunks have more influence)
-            chunk_weights = [len(chunk) for chunk in chunks]
-            total_weight = sum(chunk_weights)
-
-            # Calculate weighted average probability
-            weighted_prob = sum(prob * weight for prob, weight in zip(chunk_probabilities, chunk_weights)) / total_weight
-
-            # Blend original overall probability with chunk-based probability
-            # More chunks = more influence from chunk analysis
-            chunk_influence = min(total_chunks / 5.0, 1.0)  # Max influence at 5+ chunks
-            modified_prob = (overall_prob * (1 - chunk_influence)) + (weighted_prob * chunk_influence)
-
+        if is_mixed:
             final_prediction = 'Mixed'
-            confidence = 1.0 - mixed_ratio  # Lower confidence for mixed text
-
-            logger.info(f"  → MIXED TEXT DETECTED!")
-            logger.info(f"  → Original overall probability: {overall_prob:.3f}")
-            logger.info(f"  → Weighted chunk probability: {weighted_prob:.3f}")
-            logger.info(f"  → Chunk influence factor: {chunk_influence:.3f}")
-            logger.info(f"  → Modified probability: {modified_prob:.3f}")
-
+            modified_prob = chunk_avg_prob
+            confidence = 1.0 - mixed_ratio
+            logger.info("  → MIXED TEXT DETECTED (chunk-based)")
         else:
-            # Pure text - use chunk analysis to refine overall probability
-            chunk_avg_prob = np.mean(chunk_probabilities)
-
-            # Blend overall and chunk probabilities (chunks have 30% influence for pure text)
-            modified_prob = (overall_prob * 0.7) + (chunk_avg_prob * 0.3)
-
-            final_prediction = 'Human' if modified_prob < 0.5 else 'AI'
-            # Base confidence from modified probability (0..1)
-            base_confidence = abs(modified_prob - 0.5) * 2
-
-            # For short texts/few chunks, incorporate chunk-majority evidence to avoid
-            # under-confident results when the label is clear but probability is near 0.5.
-            if total_chunks > 0:
-                majority_ratio = max(human_chunks, ai_chunks) / total_chunks  # e.g., 3/4 => 0.75
-                combined_confidence = max(
-                    base_confidence,
-                    0.6 * majority_ratio + 0.4 * base_confidence
-                )
-                # If every chunk agrees, ensure a reasonable floor
-                if majority_ratio == 1.0 and total_chunks >= 3:
-                    combined_confidence = max(combined_confidence, 0.85)
-                confidence = min(0.99, combined_confidence)
+            final_prediction = chunk_label
+            modified_prob = chunk_avg_prob
+            confidence = abs(chunk_avg_prob - 0.5) * 2
+            logger.info(f"  → Pure {chunk_label} text based on chunk probabilities")
+
+        # Build detailed raw_chunks with character offsets
+        raw_chunks: List[Dict[str, Any]] = []
+        for idx, ((prob, label), (sent_start, sent_end)) in enumerate(zip(chunk_predictions, chunk_sentence_ranges)):
+            # Map sentence indices to char offsets
+            start_char = sentence_offsets[sent_start][0] if sent_start < len(sentence_offsets) else 0
+            end_char = sentence_offsets[sent_end][1] if sent_end < len(sentence_offsets) else len(text)
+            chunk_text = text[start_char:end_char]
+            raw_chunks.append({
+                'chunk_index': idx,
+                'start_char': start_char,
+                'end_char': end_char,
+                'text': chunk_text,
+                'probability': float(prob),
+                'label': 'ai' if label == 1 else 'human',
+                'sentence_range': [sent_start, sent_end]
+            })
+
+        # Compute per-sentence aggregated probabilities and labels (weighted by chunk presence)
+        sentence_analysis: List[Dict[str, Any]] = []
+        for si in range(len(sentences)):
+            # Find chunks covering this sentence
+            covering_probs: List[float] = []
+            covering_labels: List[int] = []
+            for (prob, label), (cs, ce) in zip(chunk_predictions, chunk_sentence_ranges):
+                if cs <= si <= ce:
+                    covering_probs.append(prob)
+                    covering_labels.append(label)
+            if covering_probs:
+                avg_prob = float(np.mean(covering_probs))
+                # Use weighted/average probability as primary signal, but also
+                # consider chunk label majority with a safety threshold.
+                # Tighten AI labeling by requiring a higher probability threshold
+                # to reduce false positives from noisy chunks.
+                label_frac = float(np.mean(covering_labels)) if covering_labels else 0.0
+                AI_PROB_THRESHOLD = 0.55
+                # If average probability is confidently AI, mark as AI.
+                if avg_prob >= AI_PROB_THRESHOLD:
+                    sentence_label = 'ai'
+                # Otherwise, if majority of covering chunks are labeled AI and
+                # probability is at least 0.5, mark as AI (minority case).
+                elif label_frac > 0.5 and avg_prob >= 0.5:
+                    sentence_label = 'ai'
+                else:
+                    sentence_label = 'human'
             else:
-                confidence = base_confidence
-
-            logger.info(f"  → Pure {final_prediction} text")
-            logger.info(f"  → Original overall probability: {overall_prob:.3f}")
-            logger.info(f"  → Average chunk probability: {chunk_avg_prob:.3f}")
-            logger.info(f"  → Modified probability: {modified_prob:.3f}")
-
+                # No covering chunks: use nearest-chunk fallback (prefer previous chunk,
+                # otherwise next chunk). This avoids falling back to the global overall_prob
+                # which can make trailing sentences inherit the global label.
+                nearest_prob = None
+                nearest_label = None
+                # find previous chunk index (the last chunk that ends before this sentence)
+                prev_idx = None
+                for idx, (cs, ce) in enumerate(chunk_sentence_ranges):
+                    if ce < si:
+                        prev_idx = idx
+                if prev_idx is not None:
+                    nearest_prob, nearest_label = chunk_predictions[prev_idx]
+                else:
+                    # find next chunk index (the first chunk that starts after this sentence)
+                    next_idx = None
+                    for idx, (cs, ce) in enumerate(chunk_sentence_ranges):
+                        if cs > si:
+                            next_idx = idx
+                            break
+                    if next_idx is not None:
+                        nearest_prob, nearest_label = chunk_predictions[next_idx]
+
+                if nearest_prob is not None:
+                    avg_prob = float(nearest_prob)
+                    sentence_label = 'ai' if nearest_label == 1 else 'human'
+                else:
+                    # Fallback to overall prediction if there are truly no chunks
+                    avg_prob = overall_prob
+                    sentence_label = 'ai' if overall_label == 1 else 'human'
+
+            start_c, end_c = sentence_offsets[si] if si < len(sentence_offsets) else (0, 0)
+            sentence_analysis.append({
+                'sentence_index': si,
+                'start_char': start_c,
+                'end_char': end_c,
+                'text': sentences[si],
+                'avg_probability': avg_prob,
+                'label': sentence_label
+            })
+
+        # Merge adjacent sentences with same label into non-overlapping spans for easy frontend rendering
+        merged_spans: List[Dict[str, Any]] = []
+        if sentence_analysis:
+            cur = sentence_analysis[0]
+            cur_start = cur['start_char']
+            cur_end = cur['end_char']
+            cur_label = cur['label']
+            cur_probs = [cur['avg_probability']]
+
+            for s in sentence_analysis[1:]:
+                if s['label'] == cur_label:
+                    # extend current span
+                    cur_end = s['end_char']
+                    cur_probs.append(s['avg_probability'])
+                else:
+                    merged_spans.append({
+                        'start_char': cur_start,
+                        'end_char': cur_end,
+                        'label': cur_label,
+                        'avg_probability': float(np.mean(cur_probs))
+                    })
+                    # start a new span
+                    cur_start = s['start_char']
+                    cur_end = s['end_char']
+                    cur_label = s['label']
+                    cur_probs = [s['avg_probability']]
+
+            # append final span
+            merged_spans.append({
+                'start_char': cur_start,
+                'end_char': cur_end,
+                'label': cur_label,
+                'avg_probability': float(np.mean(cur_probs))
+            })
+
         return {
             'prediction': final_prediction,
             'confidence': confidence,
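The span-merging logic added above collapses runs of same-label sentences into single highlightable ranges; a compact sketch with hand-made entries:

```python
import numpy as np

sentence_analysis = [  # toy per-sentence results
    {'start_char': 0,  'end_char': 15, 'label': 'human', 'avg_probability': 0.25},
    {'start_char': 16, 'end_char': 32, 'label': 'human', 'avg_probability': 0.25},
    {'start_char': 33, 'end_char': 60, 'label': 'ai',    'avg_probability': 0.75},
]

merged_spans = []
cur_start, cur_end = sentence_analysis[0]['start_char'], sentence_analysis[0]['end_char']
cur_label = sentence_analysis[0]['label']
cur_probs = [sentence_analysis[0]['avg_probability']]

for s in sentence_analysis[1:]:
    if s['label'] == cur_label:   # extend the current span
        cur_end = s['end_char']
        cur_probs.append(s['avg_probability'])
    else:                         # close it and start a new one
        merged_spans.append({'start_char': cur_start, 'end_char': cur_end,
                             'label': cur_label, 'avg_probability': float(np.mean(cur_probs))})
        cur_start, cur_end = s['start_char'], s['end_char']
        cur_label, cur_probs = s['label'], [s['avg_probability']]

merged_spans.append({'start_char': cur_start, 'end_char': cur_end,
                     'label': cur_label, 'avg_probability': float(np.mean(cur_probs))})
print(merged_spans)
# [{'start_char': 0, 'end_char': 32, 'label': 'human', 'avg_probability': 0.25},
#  {'start_char': 33, 'end_char': 60, 'label': 'ai', 'avg_probability': 0.75}]
```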
@@ -604,9 +783,14 @@
             'modified_probability': modified_prob,
             'chunk_probabilities': chunk_probabilities,
             'chunk_analysis': chunk_predictions,
+            'raw_chunks': raw_chunks,
+            'sentence_analysis': sentence_analysis,
+            'merged_spans': merged_spans,
             'chunk_size': chunk_size,
             'overlap': overlap
         }
+
+
 
     def detect_ai(self, text: str) -> Dict[str, Any]:
         """
@@ -654,24 +838,24 @@
 
         elif prediction == "AI":
             explanation = f"This text is classified as AI-Generated with {certainty}."
-            explanation += " The text exhibits patterns typical of AI-generated content, including consistent structure and predictable phrasing."
+            explanation += " The text shows patterns typically associated with AI-generated writing, including uniform structure or predictable phrasing."
             if pol_var <= 0.10:
-                explanation += " Very low emotional variation which is typical of AI texts with uniform style."
+                explanation += " Very low emotional variation is common in more structured or machine-generated texts."
             elif pol_var <= 0.35:
-                explanation += " Low emotional variation which is common in AI-generated content."
+                explanation += " Low emotional variation may align with AI patterns but can also occur in formal human writing."
             elif pol_var <= 0.60:
-                explanation += " Moderate emotional variation which is rare in AI, possibly presenting multiple viewpoints."
+                explanation += " Moderate emotional variation is less typical for AI but still possible depending on the prompt or model."
             else:
-                explanation += " High emotional variation is unusual for AI, may indicate balanced argument structure."
+                explanation += " High emotional variation is uncommon in AI outputs but may occur in certain complex or narrative prompts."
         else:  # Human
             explanation = f"This text is classified as Human-Authored with {certainty}."
-            explanation += " The text shows characteristics of human writing, such as natural variations and organic flow."
+            explanation += " The text shows patterns frequently observed in human writing, such as natural variations and flexible sentence structures."
             if pol_var > 0.60:
-                explanation += " High emotional variation which is typical of human writing with emotional swings in debates, reviews, and narratives."
+                explanation += " High emotional variation often reflects expressive or opinionated writing."
             elif pol_var >= 0.36:
-                explanation += " Moderate emotional variation which shows human-like sentiment shifts."
+                explanation += " Moderate emotional variation shows natural shifts in tone."
             elif pol_var >= 0.11:
-                explanation += " Low emotional variation which may indicate formal or academic human writing."
+                explanation += " Low emotional variation may indicate formal or academic writing."
             else:
                 explanation += " Very low emotional variation indicates consistent tone with focused perspective."
 
@@ -689,6 +873,7 @@
                 "avg_polarity": avg_pol,
                 "polarity_variance": pol_var
             },
+
             "mixed_analysis": {
                 "is_mixed": chunk_result["is_mixed"],
                 "mixed_ratio": chunk_result.get("mixed_ratio", 0),
@@ -697,10 +882,16 @@
                 "total_chunks": chunk_result.get("total_chunks", 0),
                 "overall_probability": chunk_result["overall_probability"],
                 "modified_probability": chunk_result["modified_probability"]
-            }
+            },
+            "raw_chunks": chunk_result.get("raw_chunks", []),
+            "sentence_analysis": chunk_result.get("sentence_analysis", []),
+            "merged_spans": chunk_result.get("merged_spans", []),
+            "modelProcessingTime": time.perf_counter()
         }
+
 
     def analyze_text(self, text: str) -> Dict[str, Any]:
+        start_time = time.perf_counter()
         """
         Comprehensive text analysis combining AI detection with sentiment features
 
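One caveat worth flagging: `"modelProcessingTime": time.perf_counter()` in the `detect_ai` return stores an absolute clock reading, not a duration. If elapsed time is the intent (as in `analyze_text`, which subtracts a `start_time`), the field would be computed from a start/end pair; a sketch, with a hypothetical `start` captured when `detect_ai` begins:

```python
import time

start = time.perf_counter()         # hypothetical: taken at the top of detect_ai
# ... run chunk analysis and build the response ...
elapsed = time.perf_counter() - start
response_field = round(elapsed, 3)  # seconds elapsed, not a raw clock value
```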
@@ -710,15 +901,34 @@
         Returns:
             Complete analysis results with model-based sentiment features
         """
-        # Validate input text length (80-7000 words)
+        # Validate input text length (80-2000 words)
         total_words = len(text.split())
         if total_words < 80:
             raise ValueError(f"Text too short for analysis ({total_words} words, minimum 80 words required)")
-        elif total_words > 7000:
-            raise ValueError(f"Text too long for analysis ({total_words} words, maximum 7000 words allowed)")
+        elif total_words > 2000:
+            raise ValueError(f"Text too long for analysis ({total_words} words, maximum 2000 words allowed)")
 
         # Get AI detection results (includes sentiment features from model)
         ai_detection = self.detect_ai(text)
+        mixed_analysis = ai_detection.get("mixed_analysis") or {}
+        modified_prob = mixed_analysis.get("modified_probability")
+        overall_prob = mixed_analysis.get("overall_probability")
+
+        primary_probability = None
+        for candidate in (modified_prob, overall_prob, ai_detection.get("probability")):
+            if isinstance(candidate, (int, float)):
+                primary_probability = float(candidate)
+                break
+
+        if primary_probability is None:
+            primary_probability = 0.0
+
+        ai_prob = max(0.0, min(1.0, primary_probability))
+        human_prob = 1.0 - ai_prob
+        probability_breakdown = {
+            "ai": ai_prob,
+            "human": human_prob
+        }
         model_sentiment = ai_detection.get("sentiment_features", {})
 
         # Perform basic text analysis
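The fallback chain prefers the chunk-modified probability, then the overall probability, then the top-level one; a worked sketch with toy values:

```python
# Toy inputs: no chunk-modified value, so the chain falls through to 0.62.
mixed_analysis = {"modified_probability": None, "overall_probability": 0.62}
ai_detection = {"probability": 0.70}

primary = None
for candidate in (mixed_analysis.get("modified_probability"),
                  mixed_analysis.get("overall_probability"),
                  ai_detection.get("probability")):
    if isinstance(candidate, (int, float)):  # None is skipped here
        primary = float(candidate)
        break

ai_prob = max(0.0, min(1.0, primary if primary is not None else 0.0))
print({"ai": ai_prob, "human": 1.0 - ai_prob})  # {'ai': 0.62, 'human': ~0.38}
```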
@@ -734,7 +944,6 @@
         # Determine complexity based on AI probability and text metrics
         is_ai = ai_detection["classification"] == "ai"
         is_mixed = ai_detection["classification"] == "mixed"
-        ai_prob = ai_detection["probability"]
 
         # Handle different prediction types
         if is_mixed:
@@ -757,7 +966,6 @@
         insights = []
 
         if is_mixed and ai_detection["confidence"] > 60:
-            mixed_analysis = ai_detection.get("mixed_analysis", {})
             insights.append({
                 "type": "observation",
                 "title": "Mixed Content Detected",
@@ -774,14 +982,14 @@
             insights.append({
                 "type": "observation",
                 "title": "AI-Generated Content Detected",
-                "description": f"This text shows strong indicators of AI generation ({ai_detection['confidence']:.1f}% confidence).",
-                "suggestion": "Consider adding personal anecdotes, varied sentence structures, or unique perspectives to make it more human-like."
+                "description": f"This text shows strong indicators associated with AI-generated writing ({ai_detection['confidence']:.1f}% confidence).",
+                "suggestion": "Consider adding personal insights, varied sentence structures, or unique perspectives to achieve a more distinctive voice."
             })
         elif not is_ai and ai_detection["confidence"] > 75:
             insights.append({
                 "type": "strength",
                 "title": "Human Writing Characteristics",
-                "description": f"This text exhibits clear human writing patterns ({ai_detection['confidence']:.1f}% confidence)."
+                "description": f"The text shows several features commonly found in human-authored writing ({ai_detection['confidence']:.1f}% confidence)."
             })
 
         # Sentence variety analysis
@@ -792,8 +1000,8 @@
             insights.append({
                 "type": "improvement",
                 "title": "Sentence Variety",
-                "description": "Sentences have similar lengths, which may indicate AI generation.",
-                "suggestion": "Vary sentence lengths to create more natural rhythm."
+                "description": "Sentences have similar lengths, a pattern that may indicate AI generation.",
+                "suggestion": "Consider varying sentence length to create a more natural flow."
             })
         else:
             insights.append({
@@ -818,6 +1026,11 @@
         # Construct full analysis response with model sentiment features
         polarity_variance = model_sentiment.get("polarity_variance", 0)
 
+        end_time = time.perf_counter()
+        processing_seconds = round(end_time - start_time, 3)  # exact seconds (millisecond precision)
+
+        logger.info(f"Model processing time for analyze_text: {processing_seconds:.3f}s")
+
         return {
             "advancedSentiment": {
                 "emotions": emotions,
@@ -856,7 +1069,13 @@
             },
             "aiOrHuman": ai_detection["classification"],
             "aiOrHumanConfidence": ai_detection["confidence"],
-            "aiOrHumanExplanation": ai_detection["explanation"]
+            "aiOrHumanExplanation": ai_detection["explanation"],
+            "mixedAnalysis": mixed_analysis,
+            "probabilityBreakdown": probability_breakdown,
+            "rawChunks": ai_detection.get("raw_chunks", []),
+            "sentenceAnalysis": ai_detection.get("sentence_analysis", []),
+            "mergedSpans": ai_detection.get("merged_spans", []),
+            "modelProcessingTime": processing_seconds
         }
 
     def get_model_info(self) -> Dict[str, Any]:
@@ -877,3 +1096,5 @@
             ],
             "description": "Two-branch model for detecting AI-Generated vs Human-Authored text using DeBERTa semantic embeddings combined with sentiment features"
         }
+
+#