From 70550c40213c839725f51c12f2185f51e960c735 Mon Sep 17 00:00:00 2001 From: Dominik Chilla Date: Sat, 15 Dec 2018 02:01:07 +0100 Subject: [PATCH] URI/FQDN extraction --- app/Entities.py | 24 +++++++++++++++ app/Gulag.py | 61 +++++++++++++++++++++++++++++++++----- app/GulagDB.py | 67 ++++++++++++++++++++++++++++++++++++++++-- app/GulagMailbox.py | 13 ++++++++ app/GulagUtils.py | 16 +++++++++- app/Resources.py | 12 ++++++++ app/gulag_server.py | 6 +++- db/gulag.sql | 16 +++++++--- gulag-openapi-2.0.yaml | 44 ++++++++++++++++++++++++++- 9 files changed, 243 insertions(+), 16 deletions(-) diff --git a/app/Entities.py b/app/Entities.py index c4a4323..fd6c40a 100644 --- a/app/Entities.py +++ b/app/Entities.py @@ -178,3 +178,27 @@ class Attachment: if 'href' in at_ref: self.href = at_ref['href'] +class URIException(Exception): + message = None + def __init__(self,message): + self.message = message + +class URI: + id = None + uri = None + fqdn = None + href = None + + def __init__(self,uri_ref): + if 'id' not in uri_ref: + raise URIException("'id' is mandatory!") + self.id = uri_ref['id'] + if 'uri' not in uri_ref: + raise URIException("'uri' is mandatory!") + self.uri = uri_ref['uri'] + if 'fqdn' not in uri_ref: + raise URIException("'fqdn' is mandatory!") + self.fqdn = uri_ref['fqdn'] + if 'href' in uri_ref: + self.href = uri_ref['href'] + diff --git a/app/Gulag.py b/app/Gulag.py index c1cd43a..177e62f 100644 --- a/app/Gulag.py +++ b/app/Gulag.py @@ -1,8 +1,8 @@ -import json,sys,os,logging +import json,sys,os,logging,re import email,email.header,email.message from GulagDB import GulagDB,GulagDBException from GulagMailbox import IMAPmailbox,IMAPmailboxException -from GulagUtils import whoami +from GulagUtils import whoami,extract_uris,extract_fqdn class GulagException(Exception): message = None @@ -78,6 +78,7 @@ class Gulag: for unseen in imap_mb.get_unseen_messages(): quarmail_ids = [] attachments = [] + uris = {} uid = unseen['imap_uid'] msg = email.message_from_bytes(unseen['msg']) msg_size = len(msg) @@ -154,12 +155,30 @@ class Gulag: }) attachments.append(attach_id) # Ende if part.get_filename() + # get all URIs + ctype = part.get_content_type() + if(ctype == 'text/plain' or ctype == 'text/html'): + curis = {} + curis = extract_uris(part.get_payload(decode=True).decode("utf-8")) + if(len(curis) > 0): + uris = {**uris, **curis} # Ende for msg.walk() # QuarMail und Attachments verknüpfen if(len(attachments) > 0): for quarmail_id in quarmail_ids: for attachment_id in attachments: self.db.quarmail2attachment(str(quarmail_id), str(attachment_id)) + if(len(uris) > 0): + for quarmail_id in quarmail_ids: + for uri in uris: + try: + uri_id = self.db.add_uri({ + "uri": uri, + "fqdn": extract_fqdn(uri) + }) + self.db.quarmail2uri(str(quarmail_id), str(uri_id)) + except GulagDBException as e: + logging.error(whoami(self) + e.message) # Ende for(unseen) imap_mb.close() # Ende for get_mailboxes @@ -256,10 +275,38 @@ class Gulag: if 'data' not in args: return at_db - def get_uris(self): - # https://stackoverflow.com/questions/1792366/extract-urls-out-of-email-in-python - return True - + def get_quarmail_uris(self,args): + if('from_rfc822_message' not in args): + try: + return self.db.get_quarmail_uris(args['quarmail_id']) + except GulagDBException as e: + raise GulagException(whoami(self) + e.message) from e + qm_db = None + try: + qm_db = self.db.get_quarmail({"id": args['quarmail_id']}) + except GulagDBException as e: + logging.warning(whoami(self) + e.message) + raise GulagException(whoami(self) + e.message) from e + mailbox = None + try: + mailbox = self.db.get_mailbox(qm_db['mailbox_id']) + except GulagDBException as e: + logging.warning(whoami(self) + e.message) + raise GulagException(whoami(self) + e.message) from e + imap_mb = None + try: + imap_mb = IMAPmailbox(mailbox) + mparts = imap_mb.get_main_parts(qm_db['imap_uid']) + uris = [] + uri_pattern = r'(https?:\/\/[^\s<>"]+)' + for part in mparts: + for m in re.finditer(uri_pattern, part.decode("utf-8")): + uris.append(m.group(0)) + return uris + except IMAPmailboxException as e: + logging.warning(whoami(self) + e.message) + raise GulagException(whoami(self) + e.message) from e + def rspamd_http2imap(self,args): mailbox = None try: @@ -302,7 +349,7 @@ class Gulag: ) logging.error(err) raise GulagException(err) - if('rfc822_message' not in args['rfc822_message']): + if('rfc822_message' not in args): err = str(whoami(self) + "Missing rfc822_message!" ) diff --git a/app/GulagDB.py b/app/GulagDB.py index fd0b0b4..29a4a62 100644 --- a/app/GulagDB.py +++ b/app/GulagDB.py @@ -2,7 +2,7 @@ import mysql.connector as mariadb from Entities import( Mailbox,MailboxException,QuarMail, QuarMailException,Attachment, - AttachmentException + AttachmentException,URI,URIException ) from GulagUtils import whoami @@ -99,6 +99,11 @@ class GulagDB: cnt += 1 return where_clause + def parse_filters(self,filters): + # TODO + # {"groupOp":"AND","rules":[{"field":"Customer","op":"eq","data":"eosp"}]} + return True + def get_mailboxes(self): try: cursor = self.conn.cursor() @@ -217,7 +222,7 @@ class GulagDB: data = cursor.fetchall() if not data: raise GulagDBException(whoami(self) - + "Quarmail with id '"+ args['id'] + "' does not exist!" + + "Quarmail with id '"+ str(args['id']) + "' does not exist!" ) desc = cursor.description cursor.close() @@ -380,3 +385,61 @@ class GulagDB: except mariadb.Error as e: raise GulagDBException(whoami(self) + str(e)) from e + def add_uri(self,args): + try: + cursor = self.conn.cursor() + cursor.execute("insert into URIs " + + "(uri, fqdn) values (%s,%s)", + (args['uri'], args['fqdn']) + ) + return cursor.lastrowid + except mariadb.Error as e: + raise GulagDBException(whoami(self) + str(e)) from e + + def del_uri(self,uri_id): + try: + cursor = self.conn.cursor() + cursor.execute( + "delete from URIs where uri_id=" + uri_id + ";" + ) + return cursor.lastrowid + except mariadb.Error as e: + raise GulagDBException(whoami(self) + str(e)) from e + + + def quarmail2uri(self,quarmail_id,uri_id): + try: + cursor = self.conn.cursor() + cursor.execute("insert into QuarMail2URI " + + "(quarmail_id, uri_id) values (%s,%s)", + (quarmail_id, uri_id) + ) + except mariadb.Error as e: + raise GulagDBException(whoami(self) + str(e)) from e + + def get_quarmail_uris(self,quarmail_id): + try: + query = "select URIs.*" + query += " from QuarMail2URI" + query += " left join QuarMails ON QuarMails.id = QuarMail2URI.quarmail_id" + query += " left join URIs ON URIs.id = QuarMail2URI.uri_id" + query += " where QuarMails.id = " + str(quarmail_id) + ";" + cursor = self.conn.cursor() + cursor.execute(query) + results = [] + data = cursor.fetchall() + if not data: + raise GulagDBException(whoami(self) + + "QuarMail("+ str(quarmail_id) +") has no uris!" + ) + desc = cursor.description + for tuple in data: + dict = {} + for (name, value) in zip(desc, tuple): + dict[name[0]] = value + dict['href'] = self.uri_prefixes['quarmails'] + str(quarmail_id) + dict['href'] += "/uris/" + str(dict['id']) + results.append(URI(dict).__dict__) + return results + except mariadb.Error as e: + raise GulagDBException(whoami(self) + str(e)) from e diff --git a/app/GulagMailbox.py b/app/GulagMailbox.py index 89edf25..427397e 100644 --- a/app/GulagMailbox.py +++ b/app/GulagMailbox.py @@ -91,6 +91,19 @@ class IMAPmailbox: + str(self.email_address) + " not found!" ) + def get_main_parts(self,imap_uid): + msg = email.message_from_bytes(self.get_message(imap_uid)) + mparts = [] + for part in msg.walk(): + ctype = part.get_content_type() + if(ctype == 'text/plain' or ctype == 'text/html'): + mparts.append(part.get_payload(decode=True)) + if(len(mparts) > 0): + return mparts + raise IMAPmailboxException(whoami(self) + + "IMAP_UID(" + str(imap_uid)+")@"+str(self.email_address)+" has no main parts!" + ) + def append_message(self,message): rv, data = self.mailbox.append( self.imap_mailbox, diff --git a/app/GulagUtils.py b/app/GulagUtils.py index cc9e05c..31d0661 100644 --- a/app/GulagUtils.py +++ b/app/GulagUtils.py @@ -1,4 +1,4 @@ -import sys +import sys,re from smtplib import SMTP def whoami(obj): @@ -20,3 +20,17 @@ def send_mail(args): except TimeoutError as e: raise Exception('xyz') from e +def extract_uris(string): + uris = {} + uri_pattern = r'(https?:\/\/[^\s<>"]+)' + for m in re.finditer(uri_pattern, string): + uris[m.group(0)] = {} + return uris + +def extract_fqdn(uri): + uri_pattern = r'(https?:\/\/[^\s<>"]+)' + if(re.match(uri_pattern,uri)): + m = re.match(r'https?:\/\/([^:\/]+)', uri) + return m.group(1) + else: + return None diff --git a/app/Resources.py b/app/Resources.py index ba5c8e8..a40a275 100644 --- a/app/Resources.py +++ b/app/Resources.py @@ -82,6 +82,18 @@ class ResQuarMailAttachment(GulagResource): except GulagException as e: abort(400, message=e.message) +class ResQuarMailURIs(GulagResource): + def get(self,quarmail_id): + args = { + "quarmail_id": quarmail_id + } + if(request.args.get('from_rfc822_message')): + args['from_rfc822_message'] = True + try: + return self.gulag.get_quarmail_uris(args) + except GulagException as e: + abort(400, message=e.message) + class ResAttachments(GulagResource): def get(self): return {"resource": "Attachments"} diff --git a/app/gulag_server.py b/app/gulag_server.py index 014e7fa..3bc713e 100755 --- a/app/gulag_server.py +++ b/app/gulag_server.py @@ -7,7 +7,7 @@ from Gulag import Gulag,GulagException from Resources import (ResRoot,ResMailboxes, ResQuarMails,ResQuarMail,ResQuarMailAttachments, ResQuarMailAttachment,ResAttachments,ResAttachment, - ResRSPAMDImporter + ResRSPAMDImporter,ResQuarMailURIs ) parser = argparse.ArgumentParser() parser.add_argument('--config', required=True, help="Path to config file") @@ -44,6 +44,10 @@ try: '/api/v1/quarmails//attachments/', resource_class_kwargs={'gulag_object': gulag} ) + api.add_resource(ResQuarMailURIs, + '/api/v1/quarmails//uris', + resource_class_kwargs={'gulag_object': gulag} + ) api.add_resource(ResAttachments, '/api/v1/attachments', resource_class_kwargs={'gulag_object': gulag} diff --git a/db/gulag.sql b/db/gulag.sql index e776293..e9d34a2 100644 --- a/db/gulag.sql +++ b/db/gulag.sql @@ -20,10 +20,6 @@ create table Mailboxes( smtp_pass varchar(2048) default null, comment varchar(256) default null )ENGINE = InnoDB; -insert into Mailboxes (email_address,name,imap_user,imap_pass) - values('quarantine-in@example.org','E-Mail inbound quarantine','quarantine-in','quarantine-in_secure_password'); -insert into Mailboxes (email_address,name,imap_user,imap_pass) - values('quarantine-out@example.org','E-Mail outbound quarantine','quarantine-out','quarantine-out_secure_password'); insert into Mailboxes (email_address,name,imap_user,imap_pass) values('quarantine-sandbox@example.org','E-Mail sandbox quarantine','quarantine-sb','quarantine-sb_secure_password'); @@ -60,3 +56,15 @@ create table QuarMail2Attachment ( foreign key (attachment_id) references Attachments (id) on delete cascade on update cascade )ENGINE = InnoDB; +create table URIs ( + id int unsigned auto_increment primary key, + uri varchar(2048), + fqdn varchar(512) +)ENGINE = InnoDB; + +create table QuarMail2URI ( + quarmail_id int unsigned, + uri_id int unsigned, + foreign key (quarmail_id) references QuarMails (id) on delete cascade on update cascade, + foreign key (uri_id) references URIs (id) on delete cascade on update cascade +)ENGINE = InnoDB; diff --git a/gulag-openapi-2.0.yaml b/gulag-openapi-2.0.yaml index 073e8b6..1b602ea 100644 --- a/gulag-openapi-2.0.yaml +++ b/gulag-openapi-2.0.yaml @@ -1,7 +1,7 @@ swagger: '2.0' info: description: Gulag quarantine REST API - version: '18.12' + version: "1.0.0" title: Gulag quarantine REST API contact: email: info@dc-it-con.de @@ -207,6 +207,35 @@ paths: description: bad input parameter 500: description: server error + + /quarmails/{quarmail_id}/uris: + get: + summary: "retrieves all URIS from any main MIME part (text/plain,text/html)" + operationId: get_quarmail_uris + produces: + - application/json + parameters: + - in: path + name: quarmail_id + description: unique id of quarantined email + required: true + type: string + - in: query + name: from_rfc822_message + description: fetch all URIs from RFC822 message not from database + required: false + type: string + responses: + 200: + description: array of URIs + schema: + type: array + items: + $ref: '#/definitions/URI' + 400: + description: bad input parameter + 500: + description: server error definitions: QuarMail: @@ -321,3 +350,16 @@ definitions: data: type: string description: raw/encoded (see content_encoding) attachment payload + URI: + type: object + required: + - id + - uri + - fqdn + properties: + id: + type: integer + uri: + type: string + fqdn: + type: string