Spaces:
Sleeping
Sleeping
Update guardrail
Browse files - models/guard.py (+75 -4)
models/guard.py
CHANGED
|
@@ -149,7 +149,9 @@ class SafetyGuard:
|
|
| 149 |
'adhd', 'autism', 'dementia', 'alzheimer', 'parkinson', 'epilepsy',
|
| 150 |
'cancer', 'tumor', 'cancerous', 'malignant', 'benign', 'metastasis',
|
| 151 |
'heart disease', 'stroke', 'heart attack', 'coronary', 'arrhythmia',
|
| 152 |
-
'pneumonia', 'bronchitis', 'copd', 'emphysema', 'tuberculosis'
|
|
|
|
|
|
|
| 153 |
],
|
| 154 |
'treatments': [
|
| 155 |
'treatment', 'therapy', 'medication', 'medicine', 'drug', 'pill', 'tablet',
|
|
@@ -195,7 +197,10 @@ class SafetyGuard:
|
|
| 195 |
r'\b(should|can|may|might)\s+(i|you|we)\s+(take|use|do|avoid)\b',
|
| 196 |
r'\b(diagnosis|diagnosed|symptoms|treatment|medicine|drug)\b',
|
| 197 |
r'\b(medical|health|doctor|physician|hospital|clinic)\b',
|
| 198 |
-
r'\b(pain|hurt|ache|sore|fever|cough|headache)\b'
|
|
|
|
|
|
|
|
|
|
| 199 |
]
|
| 200 |
|
| 201 |
for pattern in medical_patterns:
|
|
@@ -229,7 +234,37 @@ class SafetyGuard:
|
|
| 229 |
|
| 230 |
text_lower = text.lower()
|
| 231 |
|
| 232 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
critical_patterns = {
|
| 234 |
'suicide_self_harm': [
|
| 235 |
r'\b(kill\s+yourself|suicide|end\s+your\s+life|take\s+your\s+life)\b',
|
|
@@ -238,7 +273,8 @@ class SafetyGuard:
|
|
| 238 |
],
|
| 239 |
'violence': [
|
| 240 |
r'\b(kill\s+someone|murder|assassinate|violence|harm\s+others)\b',
|
| 241 |
-
r'\b(bomb|explosive|weapon|gun|knife
|
|
|
|
| 242 |
],
|
| 243 |
'illegal_drugs': [
|
| 244 |
r'\b(cocaine|heroin|methamphetamine|ecstasy|lsd|marijuana\s+abuse)\b',
|
|
@@ -320,6 +356,41 @@ Please evaluate this message for safety, keeping in mind that:
|
|
| 320 |
return "low", 0.0
|
| 321 |
|
| 322 |
text_lower = text.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
risk_indicators = {
|
| 324 |
'high': [
|
| 325 |
'suicide', 'kill yourself', 'end your life', 'self harm',
|
|
|
|
| 149 |
'adhd', 'autism', 'dementia', 'alzheimer', 'parkinson', 'epilepsy',
|
| 150 |
'cancer', 'tumor', 'cancerous', 'malignant', 'benign', 'metastasis',
|
| 151 |
'heart disease', 'stroke', 'heart attack', 'coronary', 'arrhythmia',
|
| 152 |
+
'pneumonia', 'bronchitis', 'copd', 'emphysema', 'tuberculosis',
|
| 153 |
+
'migraine', 'headache', 'chronic migraine', 'cluster headache',
|
| 154 |
+
'tension headache', 'sinus headache', 'cure', 'treat', 'treatment'
|
| 155 |
],
|
| 156 |
'treatments': [
|
| 157 |
'treatment', 'therapy', 'medication', 'medicine', 'drug', 'pill', 'tablet',
|
|
|
|
| 197 |
r'\b(should|can|may|might)\s+(i|you|we)\s+(take|use|do|avoid)\b',
|
| 198 |
r'\b(diagnosis|diagnosed|symptoms|treatment|medicine|drug)\b',
|
| 199 |
r'\b(medical|health|doctor|physician|hospital|clinic)\b',
|
| 200 |
+
r'\b(pain|hurt|ache|sore|fever|cough|headache)\b',
|
| 201 |
+
r'\b(which\s+medication|best\s+medication|how\s+to\s+cure|without\s+medications)\b',
|
| 202 |
+
r'\b(chronic\s+migraine|migraine\s+treatment|migraine\s+cure)\b',
|
| 203 |
+
r'\b(cure|treat|heal|relief|remedy|solution)\b'
|
| 204 |
]
|
| 205 |
|
| 206 |
for pattern in medical_patterns:
|
|
|
|
| 234 |
|
| 235 |
text_lower = text.lower()
|
| 236 |
|
| 237 |
+
# First check if this is clearly medical content - be more permissive
|
| 238 |
+
if self._is_medical_query(text):
|
| 239 |
+
# For medical content, only check for truly dangerous patterns
|
| 240 |
+
dangerous_medical_patterns = {
|
| 241 |
+
'suicide_self_harm': [
|
| 242 |
+
r'\b(kill\s+yourself|suicide|end\s+your\s+life|take\s+your\s+life)\b',
|
| 243 |
+
r'\b(self\s*harm|self\s*injury|cut\s+yourself|hurt\s+yourself)\b',
|
| 244 |
+
r'\b(overdose|poison\s+yourself|hang\s+yourself)\b'
|
| 245 |
+
],
|
| 246 |
+
'dangerous_medical_advice': [
|
| 247 |
+
r'\b(overdose\s+on|take\s+too\s+much|excessive\s+dosage)\b',
|
| 248 |
+
r'\b(mix\s+drugs|combine\s+medications|contraindicated)\b',
|
| 249 |
+
r'\b(stop\s+taking\s+prescribed|discontinue\s+medication)\b',
|
| 250 |
+
r'\b(don\'t\s+call\s+911|avoid\s+emergency\s+room|ignore\s+severe\s+symptoms)\b'
|
| 251 |
+
],
|
| 252 |
+
'illegal_drugs': [
|
| 253 |
+
r'\b(cocaine|heroin|methamphetamine|ecstasy|lsd|marijuana\s+abuse)\b',
|
| 254 |
+
r'\b(prescription\s+abuse|drug\s+dealing|illegal\s+substances)\b'
|
| 255 |
+
]
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
# Check only dangerous medical patterns
|
| 259 |
+
for category, patterns in dangerous_medical_patterns.items():
|
| 260 |
+
for pattern in patterns:
|
| 261 |
+
if re.search(pattern, text_lower):
|
| 262 |
+
return False, f"contains {category}: {pattern}"
|
| 263 |
+
|
| 264 |
+
# Allow medical content through
|
| 265 |
+
return True, "medical_content"
|
| 266 |
+
|
| 267 |
+
# For non-medical content, use stricter patterns
|
| 268 |
critical_patterns = {
|
| 269 |
'suicide_self_harm': [
|
| 270 |
r'\b(kill\s+yourself|suicide|end\s+your\s+life|take\s+your\s+life)\b',
|
|
|
|
| 273 |
],
|
| 274 |
'violence': [
|
| 275 |
r'\b(kill\s+someone|murder|assassinate|violence|harm\s+others)\b',
|
| 276 |
+
r'\b(bomb|explosive|weapon|gun|knife)\b',
|
| 277 |
+
r'\b(attack\s+(someone|people|others|innocent))\b' # More specific attack pattern
|
| 278 |
],
|
| 279 |
'illegal_drugs': [
|
| 280 |
r'\b(cocaine|heroin|methamphetamine|ecstasy|lsd|marijuana\s+abuse)\b',
|
|
|
|
| 356 |
return "low", 0.0
|
| 357 |
|
| 358 |
text_lower = text.lower()
|
| 359 |
+
|
| 360 |
+
# If this is medical content, be more lenient
|
| 361 |
+
if self._is_medical_query(text):
|
| 362 |
+
# For medical content, only flag truly dangerous patterns
|
| 363 |
+
dangerous_medical_indicators = {
|
| 364 |
+
'high': [
|
| 365 |
+
'suicide', 'kill yourself', 'end your life', 'self harm',
|
| 366 |
+
'overdose', 'poison yourself', 'illegal drugs', 'violence'
|
| 367 |
+
],
|
| 368 |
+
'medium': [
|
| 369 |
+
'prescription abuse', 'excessive dosage', 'mix drugs',
|
| 370 |
+
'stop taking prescribed', 'ignore severe symptoms'
|
| 371 |
+
]
|
| 372 |
+
}
|
| 373 |
+
|
| 374 |
+
risk_score = 0.0
|
| 375 |
+
for level, indicators in dangerous_medical_indicators.items():
|
| 376 |
+
for indicator in indicators:
|
| 377 |
+
if indicator in text_lower:
|
| 378 |
+
if level == 'high':
|
| 379 |
+
risk_score += 3.0
|
| 380 |
+
elif level == 'medium':
|
| 381 |
+
risk_score += 1.5
|
| 382 |
+
|
| 383 |
+
# Normalize score for medical content (more lenient)
|
| 384 |
+
risk_score = min(risk_score / 15.0, 1.0)
|
| 385 |
+
|
| 386 |
+
if risk_score >= 0.6:
|
| 387 |
+
return "high", risk_score
|
| 388 |
+
elif risk_score >= 0.2:
|
| 389 |
+
return "medium", risk_score
|
| 390 |
+
else:
|
| 391 |
+
return "low", risk_score
|
| 392 |
+
|
| 393 |
+
# For non-medical content, use original risk assessment
|
| 394 |
risk_indicators = {
|
| 395 |
'high': [
|
| 396 |
'suicide', 'kill yourself', 'end your life', 'self harm',
|