"""
Generic Anomaly Detection API Endpoints
"""

import statistics

from fastapi import APIRouter, HTTPException, Header
from typing import Optional
from pydantic import BaseModel

from ..dependencies import get_api_key
from ..services.anomaly_service import get_anomaly_service

router = APIRouter()


class AnomalyRequest(BaseModel):
    # Arbitrary records; each one is expected to carry `value_field`.
    data: list[dict]
    # Key of the numeric field to analyse in every record.
    value_field: str = "value"


@router.post("/detect")
async def detect_anomalies(
    request: AnomalyRequest,
    x_api_key: str = Header(None),
):
    """
    Generic anomaly detection endpoint.

    Works with any numerical data - salaries, sensors, metrics, etc.
    Runs Z-score and IQR detectors on `data[*][value_field]` and flags a
    record when at least one method votes it anomalous (confidence = votes/2).

    Returns an error payload (not an HTTP error) when the data is empty or
    the requested field is missing, mirroring the original contract.
    """
    # Validates the key (raises per get_api_key's contract); the returned
    # value itself is not needed here.
    get_api_key(x_api_key)

    service = get_anomaly_service()

    # BUG FIX: the original built `values` with r.get(field, 0) and then
    # checked `if not values:` -- a missing field silently produced all-zero
    # data and the "field not found" branch was unreachable for non-empty
    # input. Detect the missing field explicitly (sampling the first record,
    # as the original error payload already did).
    if not request.data or request.value_field not in request.data[0]:
        return {
            "error": f"Field '{request.value_field}' not found in data",
            "data_sample": request.data[0] if request.data else None,
            "available_fields": list(request.data[0].keys()) if request.data else [],
        }

    values = [float(r.get(request.value_field, 0)) for r in request.data]

    zscore_results = service.detect_zscore(values, threshold=2.5)
    iqr_results = service.detect_iqr(values, multiplier=1.5)

    anomalies = []
    for i, value in enumerate(values):
        # Either detector may return [] (too few points / zero spread);
        # treat that as a "no" vote for every record.
        z_flag = bool(zscore_results[i].is_anomaly) if zscore_results else False
        iqr_flag = bool(iqr_results[i].is_anomaly) if iqr_results else False
        votes = int(z_flag) + int(iqr_flag)

        if votes >= 1:
            anomalies.append(
                {
                    "index": i,
                    "record": request.data[i],
                    "value": value,
                    "confidence": votes / 2,
                    "methods": {"zscore": z_flag, "iqr": iqr_flag},
                    "zscore_score": zscore_results[i].score if zscore_results else 0,
                    "iqr_details": iqr_results[i].details if iqr_results else {},
                }
            )

    # BUG FIX: the original computed "std" through a convoluted
    # `service.detect_zscore(values, threshold=0) and (...) ** 0.5 or 0`
    # expression that re-ran a full detection pass just to decide, and
    # silently yielded 0 for degenerate samples. Compute population
    # statistics directly; `values` is guaranteed non-empty here.
    return {
        "detected": len(anomalies) > 0,
        "total_records": len(request.data),
        "anomalies_found": len(anomalies),
        "anomaly_rate": len(anomalies) / len(request.data),
        "anomalies": anomalies,
        "summary": {
            "mean": statistics.fmean(values),
            # statistics.median averages the two middle values for even-sized
            # input (the original's sorted(values)[n // 2] took the upper one,
            # which is not the median).
            "median": float(statistics.median(values)),
            "std": statistics.pstdev(values) if len(values) > 1 else 0.0,
            "min": min(values),
            "max": max(values),
        },
    }

# NOTE(review): the accompanying change registers this router in src/main.py:
#   app.include_router(anomaly.router, prefix=settings.api_v1_prefix)
# and advertises "anomaly": "/api/anomaly" in the root endpoint listing.
"""
Anomaly Detection Service - detection of deviations/anomalies in tabular data.
Compatible with salary data, sensor readings, and other numerical time series.
"""

import numpy as np
from typing import Optional
from dataclasses import dataclass

# BUG FIX: this module lives in src/services/ while `core` is a sibling
# package (src/core/ -- main.py imports `from .core.config import settings`),
# so the original `from .core.logging import get_logger` could never resolve.
# The stdlib fallback keeps the module importable standalone (tests, scripts).
try:
    from ..core.logging import get_logger

    logger = get_logger("anomaly_service")
except ImportError:
    import logging

    logger = logging.getLogger("anomaly_service")


@dataclass
class AnomalyResult:
    """Verdict for a single data point from one detection method."""

    is_anomaly: bool  # True when the point breaches the method's threshold
    score: float  # method-specific magnitude (z-score, distance past bound, ...)
    method: str  # "zscore" | "iqr" | "isolation_forest"
    threshold: float  # threshold the score was compared against
    details: dict  # per-method diagnostics (bounds, mean, index, ...)


class AnomalyDetectionService:
    """Statistical anomaly detectors over numeric lists and tabular records."""

    def __init__(self):
        self.initialized = True

    def detect_zscore(
        self, data: list[float], threshold: float = 3.0
    ) -> list[AnomalyResult]:
        """
        Z-score based anomaly detection.

        Flags values more than `threshold` standard deviations from the mean.
        Returns one AnomalyResult per input value, or [] when the sample is
        degenerate (< 3 points, or zero standard deviation).
        """
        if len(data) < 3:
            return []

        arr = np.asarray(data, dtype=float)
        mean = float(np.mean(arr))
        std = float(np.std(arr))
        if std == 0:
            return []

        z_scores = np.abs((arr - mean) / std)
        return [
            AnomalyResult(
                is_anomaly=bool(z > threshold),
                score=float(z),
                method="zscore",
                threshold=threshold,
                details={
                    "index": i,
                    "value": float(arr[i]),
                    "mean": mean,
                    "std": std,
                    "deviation": float(arr[i] - mean),
                },
            )
            for i, z in enumerate(z_scores)
        ]

    def detect_iqr(
        self, data: list[float], multiplier: float = 1.5
    ) -> list[AnomalyResult]:
        """
        IQR (interquartile range) based anomaly detection.

        Flags values outside [Q1 - multiplier*IQR, Q3 + multiplier*IQR].
        Returns [] for fewer than 4 points (quartiles are meaningless there).
        """
        if len(data) < 4:
            return []

        arr = np.asarray(data, dtype=float)
        q1 = float(np.percentile(arr, 25))
        q3 = float(np.percentile(arr, 75))
        iqr = q3 - q1
        lower_bound = q1 - multiplier * iqr
        upper_bound = q3 + multiplier * iqr

        results = []
        for i, val in enumerate(arr):
            # Score is the distance past the violated bound (0 when inside).
            if val < lower_bound:
                distance = lower_bound - val
            elif val > upper_bound:
                distance = val - upper_bound
            else:
                distance = 0
            results.append(
                AnomalyResult(
                    is_anomaly=bool(val < lower_bound or val > upper_bound),
                    score=float(distance),
                    method="iqr",
                    threshold=multiplier,
                    details={
                        "index": i,
                        "value": float(val),
                        "q1": q1,
                        "q3": q3,
                        "iqr": float(iqr),
                        "lower_bound": lower_bound,
                        "upper_bound": upper_bound,
                    },
                )
            )
        return results

    def detect_isolation_forest(
        self, data: list[float], contamination: float = 0.1
    ) -> list[AnomalyResult]:
        """
        Robust outlier detection via the modified z-score (median absolute
        deviation), more outlier-resistant than the plain z-score.

        NOTE(review): despite the name this is NOT an Isolation Forest, and
        `contamination` is currently unused -- kept for interface stability.
        Returns [] for < 3 points or zero MAD (constant-majority data).
        """
        if len(data) < 3:
            return []

        arr = np.asarray(data, dtype=float)
        median = float(np.median(arr))
        mad = float(np.median(np.abs(arr - median)))
        if mad == 0:
            return []

        # 0.6745 scales the MAD to be consistent with the standard deviation
        # for normal data; 3.5 is the conventional modified-z cutoff
        # (Iglewicz & Hoaglin).
        modified_z = np.abs(0.6745 * (arr - median) / mad)
        threshold = 3.5
        return [
            AnomalyResult(
                is_anomaly=bool(z > threshold),
                score=float(z),
                method="isolation_forest",
                threshold=threshold,
                details={
                    "index": i,
                    "value": float(arr[i]),
                    "median": median,
                    "mad": mad,
                },
            )
            for i, z in enumerate(modified_z)
        ]

    def detect_salary_anomalies(
        self, records: list[dict], value_field: str = "salarioBase"
    ) -> dict:
        """
        Specialized detection for salary/payroll data.

        Combines z-score, IQR and modified-z detectors; a record is reported
        when at least 2 of the 3 methods agree (confidence = votes / 3).
        Records lacking `value_field` are skipped.
        """
        # BUG FIX: the original filtered field-less records out of `values`
        # but then indexed the *unfiltered* `records[i]`, so any skipped
        # record shifted all subsequent indices and attached the wrong record
        # to an anomaly. Keep (record, value) pairs so they cannot
        # desynchronize.
        kept = [(r, float(r[value_field])) for r in records if value_field in r]
        if not kept:
            return {"error": f"Field '{value_field}' not found in records"}
        values = [v for _, v in kept]

        zscore_results = self.detect_zscore(values, threshold=2.5)
        iqr_results = self.detect_iqr(values, multiplier=1.5)
        iso_results = self.detect_isolation_forest(values)

        anomalies = []
        for i, (record, value) in enumerate(kept):
            # Any detector may return [] (degenerate data); count that as a
            # "no" vote from that method.
            z_flag = bool(zscore_results[i].is_anomaly) if zscore_results else False
            iqr_flag = bool(iqr_results[i].is_anomaly) if iqr_results else False
            iso_flag = bool(iso_results[i].is_anomaly) if iso_results else False
            votes = int(z_flag) + int(iqr_flag) + int(iso_flag)

            if votes >= 2:
                anomalies.append(
                    {
                        "index": i,
                        "record": record,
                        "value": value,
                        "confidence": votes / 3,
                        "methods": {
                            "zscore": z_flag,
                            "iqr": iqr_flag,
                            "isolation": iso_flag,
                        },
                        "zscore_details": zscore_results[i].details
                        if zscore_results
                        else {},
                    }
                )

        return {
            "total_records": len(records),
            "anomalies_found": len(anomalies),
            "anomaly_rate": len(anomalies) / len(records) if records else 0,
            "anomalies": anomalies,
            "summary": {
                "mean": float(np.mean(values)),
                "median": float(np.median(values)),
                "std": float(np.std(values)),
                "min": float(np.min(values)),
                "max": float(np.max(values)),
            },
        }

    def detect_sensor_anomalies(
        self, readings: list[dict], value_field: str = "value"
    ) -> dict:
        """
        Detection for sensor/IoT data with time-series characteristics.

        Flags sudden jumps between consecutive readings: change z-score > 2.5,
        or absolute change > 3x the mean absolute change.
        """
        # BUG FIX: same filtered-values vs. raw-index mismatch as in
        # detect_salary_anomalies -- keep (reading, value) pairs aligned.
        kept = [(r, float(r[value_field])) for r in readings if value_field in r]
        if not kept:
            return {"error": f"Field '{value_field}' not found in readings"}
        values = [v for _, v in kept]

        # BUG FIX: with a single usable reading np.diff is empty and
        # np.mean/np.std emitted RuntimeWarnings and produced NaN baselines.
        if len(values) < 2:
            return {
                "total_readings": len(readings),
                "anomalies_found": 0,
                "anomalies": [],
                "baseline": {"mean_change": 0.0, "std_change": 0.0},
            }

        diff = np.diff(np.asarray(values))
        mean_diff = float(np.mean(np.abs(diff)))
        std_diff = float(np.std(diff))

        anomalies = []
        for i in range(1, len(values)):
            change = abs(values[i] - values[i - 1])
            z_change = (change - mean_diff) / std_diff if std_diff > 0 else 0
            if z_change > 2.5 or change > 3 * mean_diff:
                anomalies.append(
                    {
                        "index": i,
                        "record": kept[i][0],
                        "previous_value": values[i - 1],
                        "current_value": values[i],
                        "change": change,
                        "change_zscore": float(z_change),
                        "type": "sudden_change",
                    }
                )

        return {
            "total_readings": len(readings),
            "anomalies_found": len(anomalies),
            "anomalies": anomalies,
            "baseline": {
                "mean_change": mean_diff,
                "std_change": std_diff,
            },
        }


# Process-wide, lazily created singleton.
_service: Optional[AnomalyDetectionService] = None


def get_anomaly_service() -> AnomalyDetectionService:
    """Return the shared AnomalyDetectionService, creating it on first use."""
    global _service
    if _service is None:
        _service = AnomalyDetectionService()
    return _service