-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdatabase_manager.py
More file actions
331 lines (302 loc) · 15.1 KB
/
database_manager.py
File metadata and controls
331 lines (302 loc) · 15.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# ==============================================================================
# database_manager.py - CORREGIDO: Deduplicación mejorada y manejo preciso
# ==============================================================================
import json
from typing import Dict, List, Optional
import psycopg2
from psycopg2.extras import RealDictCursor
import psycopg2.pool
from contextlib import contextmanager
from config import Config
from utils import generate_message_id
class DatabaseManager:
    """PostgreSQL persistence layer for moderation results and user statistics.

    Wraps a psycopg2 connection pool and exposes helpers that store analysed
    messages in ``moderation_logs``, keep the ``user_stats`` aggregates up to
    date, and query both tables. Every public helper reports database
    failures with ``print`` and falls back to a neutral return value
    (``False``, ``None``, ``[]`` or ``set()``) instead of raising.
    """

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, config: "Config"):
        """Create the connection pool and make sure the schema exists.

        Args:
            config: Settings object providing the ``POSTGRES_HOST``,
                ``POSTGRES_PORT``, ``POSTGRES_DB``, ``POSTGRES_USER`` and
                ``POSTGRES_PASSWORD`` attributes read by ``_create_pool``.
        """
        self.config = config
        self.pool = self._create_pool()
        self._init_tables()

    def _create_pool(self):
        """Create the connection pool (1 to 10 connections).

        Returns:
            The pool, or ``None`` when the connection attempt fails; the
            failure is printed rather than raised.
        """
        try:
            return psycopg2.pool.SimpleConnectionPool(
                1, 10,
                host=self.config.POSTGRES_HOST,
                port=self.config.POSTGRES_PORT,
                database=self.config.POSTGRES_DB,
                user=self.config.POSTGRES_USER,
                password=self.config.POSTGRES_PASSWORD
            )
        except Exception as e:
            print(f"❌ Error conectando a PostgreSQL: {e}")
            return None

    @contextmanager
    def get_connection(self):
        """Yield a pooled connection and always hand it back to the pool.

        Raises:
            RuntimeError: If the pool is missing (creation failed at start-up
                or ``close`` was called). Raised explicitly so callers see a
                meaningful message instead of an ``AttributeError`` on
                ``None``.
        """
        if self.pool is None:
            raise RuntimeError(
                "El pool de conexiones de PostgreSQL no está inicializado"
            )
        conn = None
        try:
            conn = self.pool.getconn()
            yield conn
        finally:
            # conn stays None when getconn() itself failed.
            if conn is not None:
                self.pool.putconn(conn)

    def close(self):
        """Close every pooled connection; a no-op when there is no pool."""
        if self.pool is not None:
            self.pool.closeall()
            self.pool = None

    def _init_tables(self):
        """Create the ``moderation_logs`` / ``user_stats`` tables and indexes.

        All statements use ``IF NOT EXISTS``, so the call can be repeated.
        Failures are printed, not raised.
        """
        create_tables_sql = """
            CREATE TABLE IF NOT EXISTS moderation_logs (
                id SERIAL PRIMARY KEY,
                message_id VARCHAR(255) UNIQUE,
                username VARCHAR(100),
                message_text TEXT,
                timestamp TIMESTAMP,
                file_source VARCHAR(255),
                toxicity_score FLOAT,
                spam_probability FLOAT,
                sentiment VARCHAR(20),
                categories TEXT[],
                requires_action BOOLEAN,
                action_type VARCHAR(50),
                reasoning TEXT,
                keywords_detected TEXT[],
                analyzed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                model_used VARCHAR(100),
                qdrant_point_id VARCHAR(100)
            );
            CREATE TABLE IF NOT EXISTS user_stats (
                id SERIAL PRIMARY KEY,
                username VARCHAR(100) UNIQUE,
                total_messages INTEGER DEFAULT 0,
                toxic_messages INTEGER DEFAULT 0,
                spam_messages INTEGER DEFAULT 0,
                avg_toxicity FLOAT DEFAULT 0.0,
                avg_spam_prob FLOAT DEFAULT 0.0,
                last_seen TIMESTAMP,
                risk_level VARCHAR(20) DEFAULT 'low',
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
            CREATE INDEX IF NOT EXISTS idx_moderation_username ON moderation_logs(username);
            CREATE INDEX IF NOT EXISTS idx_moderation_timestamp ON moderation_logs(timestamp);
            CREATE INDEX IF NOT EXISTS idx_moderation_toxicity ON moderation_logs(toxicity_score);
            CREATE INDEX IF NOT EXISTS idx_user_stats_risk ON user_stats(risk_level);
            CREATE INDEX IF NOT EXISTS idx_moderation_message_id ON moderation_logs(message_id);
        """
        try:
            with self.get_connection() as conn:
                with conn.cursor() as cur:
                    cur.execute(create_tables_sql)
                conn.commit()
            print("✅ Tablas de PostgreSQL inicializadas")
        except Exception as e:
            print(f"❌ Error inicializando tablas: {e}")

    def get_existing_message_ids(self, messages: List[dict]) -> set:
        """Return the IDs of the given messages that are already stored.

        IDs are derived with ``generate_message_id`` and looked up in a
        single batched query.

        Args:
            messages: Message dicts to check.

        Returns:
            The subset of generated IDs found in ``moderation_logs``; an
            empty set for empty input or when the query fails.
        """
        if not messages:
            return set()

        message_ids = [generate_message_id(msg) for msg in messages]

        try:
            with self.get_connection() as conn:
                with conn.cursor() as cur:
                    # Only "%s" markers are interpolated into the SQL text;
                    # the values themselves are bound by the driver.
                    placeholders = ','.join(['%s'] * len(message_ids))
                    cur.execute(f"""
                        SELECT message_id FROM moderation_logs
                        WHERE message_id IN ({placeholders})
                    """, message_ids)
                    existing_ids = {row[0] for row in cur.fetchall()}
                    print(f"🔍 PostgreSQL: {len(existing_ids)} duplicados encontrados de {len(message_ids)} mensajes")
                    return existing_ids
        except Exception as e:
            print(f"❌ Error consultando IDs existentes en PostgreSQL: {e}")
            return set()

    def get_messages_by_file_source(self, file_source: str) -> List[Dict]:
        """Return every stored row whose ``file_source`` matches.

        Returns an empty list when nothing matches or the query fails.
        """
        try:
            with self.get_connection() as conn:
                with conn.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute("""
                        SELECT * FROM moderation_logs
                        WHERE file_source = %s
                    """, (file_source,))
                    return [dict(row) for row in cur.fetchall()]
        except Exception as e:
            print(f"❌ Error recuperando mensajes por archivo: {e}")
            return []

    def save_analysis(self, message: dict, analysis: dict, point_id: str) -> bool:
        """Store one analysed message and update its author's statistics.

        The insert and the statistics update share one transaction, which is
        committed only when both succeed.

        Args:
            message: Source message; ``username``, ``text``, ``timestamp`` and
                ``file_source`` are read from it.
            analysis: Analysis result; must carry a non-empty ``message_id``.
            point_id: Value stored in the ``qdrant_point_id`` column.

        Returns:
            ``True`` when a new row was committed; ``False`` for a missing
            ``message_id``, a duplicate, or any database error.
        """
        message_id = None
        try:
            message_id = analysis.get("message_id")
            if not message_id:
                print("❌ Error: message_id faltante en analysis")
                return False

            with self.get_connection() as conn:
                with conn.cursor() as cur:
                    # Cheap pre-check; the UNIQUE constraint remains the
                    # real guard against concurrent writers (see the
                    # IntegrityError handler below).
                    cur.execute(
                        "SELECT COUNT(*) FROM moderation_logs WHERE message_id = %s",
                        (message_id,),
                    )
                    if cur.fetchone()[0] > 0:
                        print(f"⚠️ Mensaje duplicado saltado en PostgreSQL (ID: {message_id})")
                        return False

                    cur.execute("""
                        INSERT INTO moderation_logs (
                            message_id, username, message_text, timestamp, file_source,
                            toxicity_score, spam_probability, sentiment,
                            categories, requires_action, action_type, reasoning,
                            keywords_detected, model_used, qdrant_point_id
                        )
                        VALUES (
                            %(message_id)s, %(username)s, %(text)s, %(timestamp)s, %(file_source)s,
                            %(toxicity_score)s, %(spam_probability)s, %(sentiment)s,
                            %(categories)s, %(requires_action)s, %(action_type)s, %(reasoning)s,
                            %(keywords)s, %(model_used)s, %(point_id)s
                        )
                    """, {
                        "message_id": message_id,
                        "username": message.get("username", "unknown"),
                        "text": message.get("text", ""),
                        "timestamp": message.get("timestamp"),
                        "file_source": message.get("file_source", "unknown"),
                        "toxicity_score": analysis.get("toxicity_score", 0.0),
                        "spam_probability": analysis.get("spam_probability", 0.0),
                        "sentiment": analysis.get("sentiment", "neutral"),
                        "categories": analysis.get("categories", []),
                        "requires_action": analysis.get("requires_action", False),
                        "action_type": analysis.get("action_type", "none"),
                        "reasoning": analysis.get("reasoning", ""),
                        "keywords": analysis.get("keywords_detected", []),
                        "model_used": analysis.get("model_used", "ollama"),
                        "point_id": point_id
                    })

                    if cur.rowcount <= 0:
                        print(f"⚠️ No se pudo insertar mensaje (ID: {message_id})")
                        return False

                    # `or ""` covers a username key that is present but None;
                    # str() covers non-string values. Either would otherwise
                    # raise here and discard the row inserted above.
                    username = str(message.get("username") or "").strip().lower()
                    if not username or username in ["unknown", "none", "null"]:
                        username = "bot_or_unknown"
                    self._update_user_stats(cur, username, analysis)

                    # Commit first so success is only reported once durable.
                    conn.commit()
                    print(f"📝 Nuevo mensaje guardado en PostgreSQL (ID: {message_id})")
                    return True
        except psycopg2.IntegrityError as e:
            # SQLSTATE 23505 is unique_violation; the text match is kept as a
            # fallback for errors that carry no pgcode.
            if getattr(e, "pgcode", None) == "23505" or "duplicate key" in str(e):
                print(f"⚠️ Mensaje duplicado detectado por constraint (ID: {message_id})")
            else:
                print(f"❌ Error de integridad guardando en PostgreSQL: {e}")
            return False
        except Exception as e:
            print(f"❌ Error guardando en PostgreSQL: {e}")
            return False

    def get_analysis_by_message_id(self, message_id: str) -> Optional[Dict]:
        """Return the stored row for ``message_id``.

        Returns ``None`` when no row matches or the query fails.
        """
        try:
            with self.get_connection() as conn:
                with conn.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute("""
                        SELECT * FROM moderation_logs
                        WHERE message_id = %s
                    """, (message_id,))
                    result = cur.fetchone()
                    return dict(result) if result else None
        except Exception as e:
            print(f"❌ Error obteniendo análisis por message_id {message_id}: {e}")
            return None

    def get_recent_messages_by_user(self, username: str, limit: int = 10) -> List[Dict]:
        """Return up to ``limit`` rows for ``username``, newest first.

        Returns an empty list when nothing matches or the query fails.
        """
        try:
            with self.get_connection() as conn:
                with conn.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute("""
                        SELECT * FROM moderation_logs
                        WHERE username = %s
                        ORDER BY timestamp DESC
                        LIMIT %s
                    """, (username, limit))
                    return [dict(row) for row in cur.fetchall()]
        except Exception as e:
            print(f"❌ Error obteniendo mensajes recientes de {username}: {e}")
            return []

    def get_messages_in_timeframe(self, start_time, end_time) -> List[Dict]:
        """Return rows with ``timestamp`` between the two bounds, oldest first.

        The bounds are inclusive (SQL ``BETWEEN``). Returns an empty list
        when nothing matches or the query fails.
        """
        try:
            with self.get_connection() as conn:
                with conn.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute("""
                        SELECT * FROM moderation_logs
                        WHERE timestamp BETWEEN %s AND %s
                        ORDER BY timestamp ASC
                    """, (start_time, end_time))
                    return [dict(row) for row in cur.fetchall()]
        except Exception as e:
            print(f"❌ Error obteniendo mensajes en rango temporal: {e}")
            return []

    def _update_user_stats(self, cursor, username: str, analysis: Dict):
        """Upsert the running statistics of ``username`` with one new message.

        A message counts as toxic / spam when its score is above 0.5. The
        risk level is 'high' above an average toxicity of 0.7, 'medium' above
        0.4, otherwise 'low'.

        Args:
            cursor: Open cursor; the caller owns the transaction and commits.
            username: Key of the ``user_stats`` row to insert or update.
            analysis: Analysis dict; ``toxicity_score`` and
                ``spam_probability`` are read, missing or ``None`` values
                count as 0.
        """
        # `or 0` guards against keys that are present but None.
        toxicity = float(analysis.get('toxicity_score') or 0)
        spam = float(analysis.get('spam_probability') or 0)

        # For a brand-new user the average equals this single score, so the
        # risk level is derived here; relying on the column default would
        # leave a highly toxic first message at 'low'.
        if toxicity > 0.7:
            initial_risk = 'high'
        elif toxicity > 0.4:
            initial_risk = 'medium'
        else:
            initial_risk = 'low'

        params = {
            "username": username,
            "is_toxic": 1 if toxicity > 0.5 else 0,
            "is_spam": 1 if spam > 0.5 else 0,
            "toxicity": toxicity,
            "spam": spam,
            "initial_risk": initial_risk,
        }
        # Inside DO UPDATE, `user_stats.<col>` refers to the row as it was
        # before this statement, so every expression sees the old totals.
        cursor.execute("""
            INSERT INTO user_stats (username, total_messages, toxic_messages, spam_messages,
                                    avg_toxicity, avg_spam_prob, last_seen, risk_level)
            VALUES (%(username)s, 1, %(is_toxic)s, %(is_spam)s,
                    %(toxicity)s, %(spam)s, CURRENT_TIMESTAMP, %(initial_risk)s)
            ON CONFLICT (username) DO UPDATE SET
                total_messages = user_stats.total_messages + 1,
                toxic_messages = user_stats.toxic_messages + %(is_toxic)s,
                spam_messages = user_stats.spam_messages + %(is_spam)s,
                avg_toxicity = (user_stats.avg_toxicity * user_stats.total_messages + %(toxicity)s) / (user_stats.total_messages + 1),
                avg_spam_prob = (user_stats.avg_spam_prob * user_stats.total_messages + %(spam)s) / (user_stats.total_messages + 1),
                last_seen = CURRENT_TIMESTAMP,
                updated_at = CURRENT_TIMESTAMP,
                risk_level = CASE
                    WHEN (user_stats.avg_toxicity * user_stats.total_messages + %(toxicity)s) / (user_stats.total_messages + 1) > 0.7 THEN 'high'
                    WHEN (user_stats.avg_toxicity * user_stats.total_messages + %(toxicity)s) / (user_stats.total_messages + 1) > 0.4 THEN 'medium'
                    ELSE 'low'
                END
        """, params)

    def get_user_risk_summary(self) -> List[Dict]:
        """Return up to 20 users whose risk level is 'medium' or 'high'.

        Rows are ordered by average toxicity, then message count, both
        descending. Returns an empty list when the query fails.
        """
        try:
            with self.get_connection() as conn:
                with conn.cursor(cursor_factory=RealDictCursor) as cur:
                    cur.execute("""
                        SELECT username, total_messages, avg_toxicity, avg_spam_prob,
                               risk_level, last_seen
                        FROM user_stats
                        WHERE risk_level IN ('medium', 'high')
                        ORDER BY avg_toxicity DESC, total_messages DESC
                        LIMIT 20
                    """)
                    # Plain dicts, consistent with the other query helpers.
                    return [dict(row) for row in cur.fetchall()]
        except Exception as e:
            print(f"❌ Error obteniendo resumen: {e}")
            return []

    def get_user_message_history(self, username: str, days_back: int = 7) -> List[Dict]:
        """Return the rows of ``username`` from the last ``days_back`` days.

        Rows are ordered newest first. Returns an empty list when nothing
        matches, ``days_back`` is not an integer value, or the query fails.
        """
        try:
            with self.get_connection() as conn:
                with conn.cursor(cursor_factory=RealDictCursor) as cur:
                    # The placeholder must stay outside the quoted literal:
                    # the driver quotes bound values itself, so
                    # INTERVAL '%s days' only worked by accident for ints.
                    cur.execute("""
                        SELECT * FROM moderation_logs
                        WHERE username = %s
                        AND timestamp >= NOW() - (%s * INTERVAL '1 day')
                        ORDER BY timestamp DESC
                    """, (username, int(days_back)))
                    return [dict(row) for row in cur.fetchall()]
        except Exception as e:
            print(f"❌ Error obteniendo historial de {username}: {e}")
            return []