BinKhoaLe1812 committed
Commit 4364204 · verified · 1 Parent(s): 952af56

Upd guardrail

Files changed (1):
  models/guard.py +75 -4
models/guard.py CHANGED
@@ -149,7 +149,9 @@ class SafetyGuard:
             'adhd', 'autism', 'dementia', 'alzheimer', 'parkinson', 'epilepsy',
             'cancer', 'tumor', 'cancerous', 'malignant', 'benign', 'metastasis',
             'heart disease', 'stroke', 'heart attack', 'coronary', 'arrhythmia',
-            'pneumonia', 'bronchitis', 'copd', 'emphysema', 'tuberculosis'
+            'pneumonia', 'bronchitis', 'copd', 'emphysema', 'tuberculosis',
+            'migraine', 'headache', 'chronic migraine', 'cluster headache',
+            'tension headache', 'sinus headache', 'cure', 'treat', 'treatment'
         ],
         'treatments': [
             'treatment', 'therapy', 'medication', 'medicine', 'drug', 'pill', 'tablet',
@@ -195,7 +197,10 @@ class SafetyGuard:
             r'\b(should|can|may|might)\s+(i|you|we)\s+(take|use|do|avoid)\b',
             r'\b(diagnosis|diagnosed|symptoms|treatment|medicine|drug)\b',
             r'\b(medical|health|doctor|physician|hospital|clinic)\b',
-            r'\b(pain|hurt|ache|sore|fever|cough|headache)\b'
+            r'\b(pain|hurt|ache|sore|fever|cough|headache)\b',
+            r'\b(which\s+medication|best\s+medication|how\s+to\s+cure|without\s+medications)\b',
+            r'\b(chronic\s+migraine|migraine\s+treatment|migraine\s+cure)\b',
+            r'\b(cure|treat|heal|relief|remedy|solution)\b'
         ]
 
         for pattern in medical_patterns:
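The two hunks above widen the medical detector on both fronts: the keyword vocabulary (migraine and headache variants, plus generic terms like 'cure' and 'treat') and the regex patterns scanned against free-form questions. A minimal standalone sketch of that detection step, assuming the repo's _is_medical_query works roughly this way (the function name, constant, and pattern subset below are illustrative, not copied from the file):

import re

# Assumed shape of the detector; the real _is_medical_query lives in models/guard.py.
MEDICAL_PATTERNS = [
    r'\b(pain|hurt|ache|sore|fever|cough|headache)\b',
    r'\b(which\s+medication|best\s+medication|how\s+to\s+cure|without\s+medications)\b',
    r'\b(chronic\s+migraine|migraine\s+treatment|migraine\s+cure)\b',
    r'\b(cure|treat|heal|relief|remedy|solution)\b',
]

def is_medical_query(text: str) -> bool:
    """Return True if any medical pattern matches the lowercased text."""
    text_lower = text.lower()
    return any(re.search(p, text_lower) for p in MEDICAL_PATTERNS)

# Example: this hits both of the new migraine patterns added in this commit.
assert is_medical_query("How to cure chronic migraine without medications?")

Note how broad the net now is: the bare r'\b(cure|treat|heal|relief|remedy|solution)\b' pattern will route many non-clinical sentences ("how to treat a stain") into the medical branch as well.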
@@ -229,7 +234,37 @@ class SafetyGuard:
 
         text_lower = text.lower()
 
-        # Critical harmful patterns (immediate block)
+        # First check if this is clearly medical content - be more permissive
+        if self._is_medical_query(text):
+            # For medical content, only check for truly dangerous patterns
+            dangerous_medical_patterns = {
+                'suicide_self_harm': [
+                    r'\b(kill\s+yourself|suicide|end\s+your\s+life|take\s+your\s+life)\b',
+                    r'\b(self\s*harm|self\s*injury|cut\s+yourself|hurt\s+yourself)\b',
+                    r'\b(overdose|poison\s+yourself|hang\s+yourself)\b'
+                ],
+                'dangerous_medical_advice': [
+                    r'\b(overdose\s+on|take\s+too\s+much|excessive\s+dosage)\b',
+                    r'\b(mix\s+drugs|combine\s+medications|contraindicated)\b',
+                    r'\b(stop\s+taking\s+prescribed|discontinue\s+medication)\b',
+                    r'\b(don\'t\s+call\s+911|avoid\s+emergency\s+room|ignore\s+severe\s+symptoms)\b'
+                ],
+                'illegal_drugs': [
+                    r'\b(cocaine|heroin|methamphetamine|ecstasy|lsd|marijuana\s+abuse)\b',
+                    r'\b(prescription\s+abuse|drug\s+dealing|illegal\s+substances)\b'
+                ]
+            }
+
+            # Check only dangerous medical patterns
+            for category, patterns in dangerous_medical_patterns.items():
+                for pattern in patterns:
+                    if re.search(pattern, text_lower):
+                        return False, f"contains {category}: {pattern}"
+
+            # Allow medical content through
+            return True, "medical_content"
+
+        # For non-medical content, use stricter patterns
         critical_patterns = {
             'suicide_self_harm': [
                 r'\b(kill\s+yourself|suicide|end\s+your\s+life|take\s+your\s+life)\b',
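The hunk above changes the control flow of the critical-pattern check: a message flagged as medical is screened only against dangerous_medical_patterns and, if nothing fires, passes with the reason "medical_content"; everything else still hits the stricter critical_patterns table. A hedged usage sketch — the method name is_safe and the no-argument constructor are assumptions; the diff only shows the (bool, str) returns inside class SafetyGuard:

# Hypothetical usage, assuming SafetyGuard exposes a check method with the
# (allowed: bool, reason: str) contract seen in the diff's return statements.
from models.guard import SafetyGuard

guard = SafetyGuard()

allowed, reason = guard.is_safe("What treatment helps a tension headache?")
# -> (True, "medical_content"): medical, and no dangerous pattern matches.

allowed, reason = guard.is_safe("Is it safe to mix drugs to treat my headache?")
# -> (False, "contains dangerous_medical_advice: ..."): medical, but the
#    r'\b(mix\s+drugs|...)\b' pattern fires inside the medical branch.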
@@ -238,7 +273,8 @@ class SafetyGuard:
             ],
             'violence': [
                 r'\b(kill\s+someone|murder|assassinate|violence|harm\s+others)\b',
-                r'\b(bomb|explosive|weapon|gun|knife|attack)\b'
+                r'\b(bomb|explosive|weapon|gun|knife)\b',
+                r'\b(attack\s+(someone|people|others|innocent))\b'  # More specific attack pattern
             ],
             'illegal_drugs': [
                 r'\b(cocaine|heroin|methamphetamine|ecstasy|lsd|marijuana\s+abuse)\b',
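The violence hunk above exists because the old catch-all matched the bare word 'attack', which collides with medical phrases like 'heart attack' (itself a keyword in the conditions list at line 151). A quick check of old versus new behavior (constant names are mine; the patterns are taken from the diff):

import re

OLD = r'\b(bomb|explosive|weapon|gun|knife|attack)\b'
NEW = r'\b(attack\s+(someone|people|others|innocent))\b'

msg = "symptoms after a heart attack"
print(bool(re.search(OLD, msg)))                      # True  - old pattern blocked this medical phrase
print(bool(re.search(NEW, msg)))                      # False - new pattern ignores it
print(bool(re.search(NEW, "how to attack someone")))  # True  - still caught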
@@ -320,6 +356,41 @@ Please evaluate this message for safety, keeping in mind that:
             return "low", 0.0
 
         text_lower = text.lower()
+
+        # If this is medical content, be more lenient
+        if self._is_medical_query(text):
+            # For medical content, only flag truly dangerous patterns
+            dangerous_medical_indicators = {
+                'high': [
+                    'suicide', 'kill yourself', 'end your life', 'self harm',
+                    'overdose', 'poison yourself', 'illegal drugs', 'violence'
+                ],
+                'medium': [
+                    'prescription abuse', 'excessive dosage', 'mix drugs',
+                    'stop taking prescribed', 'ignore severe symptoms'
+                ]
+            }
+
+            risk_score = 0.0
+            for level, indicators in dangerous_medical_indicators.items():
+                for indicator in indicators:
+                    if indicator in text_lower:
+                        if level == 'high':
+                            risk_score += 3.0
+                        elif level == 'medium':
+                            risk_score += 1.5
+
+            # Normalize score for medical content (more lenient)
+            risk_score = min(risk_score / 15.0, 1.0)
+
+            if risk_score >= 0.6:
+                return "high", risk_score
+            elif risk_score >= 0.2:
+                return "medium", risk_score
+            else:
+                return "low", risk_score
+
+        # For non-medical content, use original risk assessment
         risk_indicators = {
             'high': [
                 'suicide', 'kill yourself', 'end your life', 'self harm',
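Worked numbers for the lenient scoring path above: each 'high' indicator adds 3.0 and each 'medium' adds 1.5 before division by 15.0, so one of each yields 4.5 / 15.0 = 0.3, landing in the [0.2, 0.6) 'medium' band:

# Worked instance of the medical-content scoring in the hunk above.
risk_score = 0.0
risk_score += 3.0   # one 'high' indicator matched, e.g. 'overdose'
risk_score += 1.5   # one 'medium' indicator matched, e.g. 'mix drugs'

risk_score = min(risk_score / 15.0, 1.0)  # -> 0.3
level = "high" if risk_score >= 0.6 else "medium" if risk_score >= 0.2 else "low"
print(level, risk_score)  # medium 0.3

Under this normalization a "high" verdict needs a raw score of at least 9.0, i.e. three 'high' indicators, which is the leniency the inline comment advertises.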
 