URI/FQDN extraction

This commit is contained in:
Dominik Chilla 2018-12-15 02:01:07 +01:00
parent a50740d60f
commit 70550c4021
9 changed files with 243 additions and 16 deletions

View File

@ -178,3 +178,27 @@ class Attachment:
if 'href' in at_ref: if 'href' in at_ref:
self.href = at_ref['href'] self.href = at_ref['href']
class URIException(Exception):
    """Raised when a URI entity cannot be built from the data it was given.

    Attributes:
        message: Human-readable description of the problem.
    """
    message = None

    def __init__(self, message):
        # Pass the message on to Exception so that str(e), e.args and
        # tracebacks carry it instead of being empty.
        super().__init__(message)
        self.message = message
class URI:
    """One URI record, populated from a dict-like row.

    The keys 'id', 'uri' and 'fqdn' must be present in the row, otherwise a
    URIException is raised. 'href' is copied only when the row provides it;
    without it the class-level default (None) stays visible.
    """
    id = None
    uri = None
    fqdn = None
    href = None

    def __init__(self, uri_ref):
        # Validate and copy the mandatory fields one at a time, in a fixed
        # order, so the first missing field is the one that gets reported.
        for field in ('id', 'uri', 'fqdn'):
            if field not in uri_ref:
                raise URIException("'" + field + "' is mandatory!")
            setattr(self, field, uri_ref[field])
        if 'href' in uri_ref:
            self.href = uri_ref['href']

View File

@ -1,8 +1,8 @@
import json,sys,os,logging import json,sys,os,logging,re
import email,email.header,email.message import email,email.header,email.message
from GulagDB import GulagDB,GulagDBException from GulagDB import GulagDB,GulagDBException
from GulagMailbox import IMAPmailbox,IMAPmailboxException from GulagMailbox import IMAPmailbox,IMAPmailboxException
from GulagUtils import whoami from GulagUtils import whoami,extract_uris,extract_fqdn
class GulagException(Exception): class GulagException(Exception):
message = None message = None
@ -78,6 +78,7 @@ class Gulag:
for unseen in imap_mb.get_unseen_messages(): for unseen in imap_mb.get_unseen_messages():
quarmail_ids = [] quarmail_ids = []
attachments = [] attachments = []
uris = {}
uid = unseen['imap_uid'] uid = unseen['imap_uid']
msg = email.message_from_bytes(unseen['msg']) msg = email.message_from_bytes(unseen['msg'])
msg_size = len(msg) msg_size = len(msg)
@ -154,12 +155,30 @@ class Gulag:
}) })
attachments.append(attach_id) attachments.append(attach_id)
# Ende if part.get_filename() # Ende if part.get_filename()
# get all URIs
ctype = part.get_content_type()
if(ctype == 'text/plain' or ctype == 'text/html'):
curis = {}
curis = extract_uris(part.get_payload(decode=True).decode("utf-8"))
if(len(curis) > 0):
uris = {**uris, **curis}
# Ende for msg.walk() # Ende for msg.walk()
# QuarMail und Attachments verknüpfen # QuarMail und Attachments verknüpfen
if(len(attachments) > 0): if(len(attachments) > 0):
for quarmail_id in quarmail_ids: for quarmail_id in quarmail_ids:
for attachment_id in attachments: for attachment_id in attachments:
self.db.quarmail2attachment(str(quarmail_id), str(attachment_id)) self.db.quarmail2attachment(str(quarmail_id), str(attachment_id))
if(len(uris) > 0):
for quarmail_id in quarmail_ids:
for uri in uris:
try:
uri_id = self.db.add_uri({
"uri": uri,
"fqdn": extract_fqdn(uri)
})
self.db.quarmail2uri(str(quarmail_id), str(uri_id))
except GulagDBException as e:
logging.error(whoami(self) + e.message)
# Ende for(unseen) # Ende for(unseen)
imap_mb.close() imap_mb.close()
# Ende for get_mailboxes # Ende for get_mailboxes
@ -256,9 +275,37 @@ class Gulag:
if 'data' not in args: if 'data' not in args:
return at_db return at_db
def get_quarmail_uris(self, args):
    """Return the URIs that belong to one quarantined mail.

    Args:
        args: Mapping with the key 'quarmail_id'. When the key
            'from_rfc822_message' is present as well, the URIs are
            extracted from the stored message fetched via IMAP instead of
            being read from the database.

    Returns:
        Without 'from_rfc822_message': the result of
        self.db.get_quarmail_uris(). Otherwise a list of URI strings in
        order of appearance (duplicates are kept).

    Raises:
        GulagException: If the database lookup or the IMAP access fails.
    """
    if 'from_rfc822_message' not in args:
        try:
            return self.db.get_quarmail_uris(args['quarmail_id'])
        except GulagDBException as e:
            raise GulagException(whoami(self) + e.message) from e

    # Resolve the mail and the mailbox it lives in.
    try:
        qm_db = self.db.get_quarmail({"id": args['quarmail_id']})
        mailbox = self.db.get_mailbox(qm_db['mailbox_id'])
    except GulagDBException as e:
        logging.warning(whoami(self) + e.message)
        raise GulagException(whoami(self) + e.message) from e

    try:
        imap_mb = IMAPmailbox(mailbox)
        try:
            mparts = imap_mb.get_main_parts(qm_db['imap_uid'])
        finally:
            # Always release the IMAP connection, as the import loop does,
            # so repeated API calls do not pile up open sessions.
            imap_mb.close()
    except IMAPmailboxException as e:
        logging.warning(whoami(self) + e.message)
        raise GulagException(whoami(self) + e.message) from e

    uri_pattern = re.compile(r'(https?:\/\/[^\s<>"]+)')
    uris = []
    for part in mparts:
        # Mail bodies are not guaranteed to be UTF-8; replace undecodable
        # bytes instead of failing the whole request.
        text = part.decode("utf-8", errors="replace")
        uris.extend(m.group(0) for m in uri_pattern.finditer(text))
    return uris
def rspamd_http2imap(self,args): def rspamd_http2imap(self,args):
mailbox = None mailbox = None
@ -302,7 +349,7 @@ class Gulag:
) )
logging.error(err) logging.error(err)
raise GulagException(err) raise GulagException(err)
if('rfc822_message' not in args['rfc822_message']): if('rfc822_message' not in args):
err = str(whoami(self) err = str(whoami(self)
+ "Missing rfc822_message!" + "Missing rfc822_message!"
) )

View File

@ -2,7 +2,7 @@ import mysql.connector as mariadb
from Entities import( from Entities import(
Mailbox,MailboxException,QuarMail, Mailbox,MailboxException,QuarMail,
QuarMailException,Attachment, QuarMailException,Attachment,
AttachmentException AttachmentException,URI,URIException
) )
from GulagUtils import whoami from GulagUtils import whoami
@ -99,6 +99,11 @@ class GulagDB:
cnt += 1 cnt += 1
return where_clause return where_clause
def parse_filters(self,filters):
# TODO: not implemented yet - `filters` is ignored and True is always returned.
# Example of a filter document, presumably the shape this method is meant
# to handle once implemented (verify against the caller):
# {"groupOp":"AND","rules":[{"field":"Customer","op":"eq","data":"eosp"}]}
return True
def get_mailboxes(self): def get_mailboxes(self):
try: try:
cursor = self.conn.cursor() cursor = self.conn.cursor()
@ -217,7 +222,7 @@ class GulagDB:
data = cursor.fetchall() data = cursor.fetchall()
if not data: if not data:
raise GulagDBException(whoami(self) raise GulagDBException(whoami(self)
+ "Quarmail with id '"+ args['id'] + "' does not exist!" + "Quarmail with id '"+ str(args['id']) + "' does not exist!"
) )
desc = cursor.description desc = cursor.description
cursor.close() cursor.close()
@ -380,3 +385,61 @@ class GulagDB:
except mariadb.Error as e: except mariadb.Error as e:
raise GulagDBException(whoami(self) + str(e)) from e raise GulagDBException(whoami(self) + str(e)) from e
def add_uri(self, args):
    """Insert one row into the URIs table.

    Args:
        args: Mapping with the keys 'uri' and 'fqdn'.

    Returns:
        The cursor's lastrowid after the insert.

    Raises:
        GulagDBException: If the database driver reports an error.
    """
    cursor = None
    try:
        cursor = self.conn.cursor()
        cursor.execute(
            "insert into URIs (uri, fqdn) values (%s,%s)",
            (args['uri'], args['fqdn'])
        )
        return cursor.lastrowid
    except mariadb.Error as e:
        raise GulagDBException(whoami(self) + str(e)) from e
    finally:
        # Release the cursor on every path, success or failure.
        if cursor is not None:
            cursor.close()
def del_uri(self, uri_id):
    """Delete one row from the URIs table.

    Args:
        uri_id: Primary key (column `id`) of the row to delete; int or str.

    Returns:
        The cursor's lastrowid after the statement (kept for backward
        compatibility with existing callers).

    Raises:
        GulagDBException: If the database driver reports an error.
    """
    cursor = None
    try:
        cursor = self.conn.cursor()
        # The key column of URIs is `id` (not `uri_id`). The value is bound
        # as a parameter so that integer ids work and no SQL can be
        # injected through it.
        cursor.execute("delete from URIs where id=%s", (uri_id,))
        return cursor.lastrowid
    except mariadb.Error as e:
        raise GulagDBException(whoami(self) + str(e)) from e
    finally:
        # Release the cursor on every path, success or failure.
        if cursor is not None:
            cursor.close()
def quarmail2uri(self, quarmail_id, uri_id):
    """Link a quarantined mail to a URI in the QuarMail2URI table.

    Args:
        quarmail_id: Id of the quarantined mail.
        uri_id: Id of the URI row.

    Raises:
        GulagDBException: If the database driver reports an error.
    """
    cursor = None
    try:
        cursor = self.conn.cursor()
        cursor.execute(
            "insert into QuarMail2URI (quarmail_id, uri_id) values (%s,%s)",
            (quarmail_id, uri_id)
        )
    except mariadb.Error as e:
        raise GulagDBException(whoami(self) + str(e)) from e
    finally:
        # Release the cursor on every path, success or failure.
        if cursor is not None:
            cursor.close()
def get_quarmail_uris(self, quarmail_id):
    """Return all URIs linked to one quarantined mail.

    Args:
        quarmail_id: Id of the quarantined mail.

    Returns:
        A list of dicts, one per URI row, holding the row's columns plus
        an 'href' of the form <quarmails prefix><quarmail_id>/uris/<id>.

    Raises:
        GulagDBException: If the mail has no URIs or the database driver
            reports an error.
    """
    query = (
        "select URIs.*"
        " from QuarMail2URI"
        " left join QuarMails ON QuarMails.id = QuarMail2URI.quarmail_id"
        " left join URIs ON URIs.id = QuarMail2URI.uri_id"
        " where QuarMails.id = %s"
    )
    cursor = None
    try:
        cursor = self.conn.cursor()
        # Bind the id as a parameter instead of splicing it into the SQL.
        cursor.execute(query, (quarmail_id,))
        rows = cursor.fetchall()
        if not rows:
            raise GulagDBException(whoami(self)
                + "QuarMail("+ str(quarmail_id) +") has no uris!"
            )
        # The first element of each description entry is the column name.
        columns = [column[0] for column in cursor.description]
        href_base = self.uri_prefixes['quarmails'] + str(quarmail_id) + "/uris/"
        results = []
        for row in rows:
            record = {name: value for name, value in zip(columns, row)}
            record['href'] = href_base + str(record['id'])
            results.append(URI(record).__dict__)
        return results
    except mariadb.Error as e:
        raise GulagDBException(whoami(self) + str(e)) from e
    finally:
        # Release the cursor on every path, including the "no uris" error.
        if cursor is not None:
            cursor.close()

View File

@ -91,6 +91,19 @@ class IMAPmailbox:
+ str(self.email_address) + " not found!" + str(self.email_address) + " not found!"
) )
def get_main_parts(self, imap_uid):
    """Return the decoded payloads of all text/plain and text/html parts.

    Args:
        imap_uid: UID of the message, passed to self.get_message().

    Returns:
        A non-empty list with the decoded payload of every matching MIME
        part, in the order the message is walked.

    Raises:
        IMAPmailboxException: If the message has no text/plain or
            text/html part.
    """
    message = email.message_from_bytes(self.get_message(imap_uid))
    bodies = [
        part.get_payload(decode=True)
        for part in message.walk()
        if part.get_content_type() in ('text/plain', 'text/html')
    ]
    if not bodies:
        raise IMAPmailboxException(whoami(self) +
            "IMAP_UID(" + str(imap_uid)+")@"+str(self.email_address)+" has no main parts!"
        )
    return bodies
def append_message(self,message): def append_message(self,message):
rv, data = self.mailbox.append( rv, data = self.mailbox.append(
self.imap_mailbox, self.imap_mailbox,

View File

@ -1,4 +1,4 @@
import sys import sys,re
from smtplib import SMTP from smtplib import SMTP
def whoami(obj): def whoami(obj):
@ -20,3 +20,17 @@ def send_mail(args):
except TimeoutError as e: except TimeoutError as e:
raise Exception('xyz') from e raise Exception('xyz') from e
def extract_uris(string):
    """Collect every http/https URI found in *string*.

    Returns:
        A dict keyed by URI; each value is a fresh empty dict. A URI that
        occurs more than once appears only once.
    """
    found = re.finditer(r'(https?:\/\/[^\s<>"]+)', string)
    return {hit.group(0): {} for hit in found}
def extract_fqdn(uri):
    """Return the host name of an http/https URI.

    Userinfo, port, path, query and fragment are not part of the result,
    and the host is returned in lower case.

    Args:
        uri: The URI string to inspect.

    Returns:
        The host name, or None if *uri* is not an http/https URI or has
        no usable host part.
    """
    # Imported locally so this function carries its own dependency.
    from urllib.parse import urlsplit

    uri_pattern = r'(https?:\/\/[^\s<>"]+)'
    if not re.match(uri_pattern, uri):
        return None
    try:
        # hostname is None for an empty authority such as "http:///path".
        return urlsplit(uri).hostname
    except ValueError:
        # urlsplit rejects malformed authorities (e.g. a broken IPv6 literal).
        return None

View File

@ -82,6 +82,18 @@ class ResQuarMailAttachment(GulagResource):
except GulagException as e: except GulagException as e:
abort(400, message=e.message) abort(400, message=e.message)
class ResQuarMailURIs(GulagResource):
    """REST resource for the URIs of a single quarantined mail."""

    def get(self, quarmail_id):
        """Return the URIs of the mail identified by *quarmail_id*.

        A truthy 'from_rfc822_message' query parameter is forwarded as a
        flag to self.gulag.get_quarmail_uris(). A GulagException is turned
        into an HTTP 400 carrying the exception's message.
        """
        lookup = {"quarmail_id": quarmail_id}
        from_message = request.args.get('from_rfc822_message')
        if from_message:
            lookup['from_rfc822_message'] = True
        try:
            uris = self.gulag.get_quarmail_uris(lookup)
        except GulagException as err:
            abort(400, message=err.message)
        else:
            return uris
class ResAttachments(GulagResource): class ResAttachments(GulagResource):
def get(self): def get(self):
return {"resource": "Attachments"} return {"resource": "Attachments"}

View File

@ -7,7 +7,7 @@ from Gulag import Gulag,GulagException
from Resources import (ResRoot,ResMailboxes, from Resources import (ResRoot,ResMailboxes,
ResQuarMails,ResQuarMail,ResQuarMailAttachments, ResQuarMails,ResQuarMail,ResQuarMailAttachments,
ResQuarMailAttachment,ResAttachments,ResAttachment, ResQuarMailAttachment,ResAttachments,ResAttachment,
ResRSPAMDImporter ResRSPAMDImporter,ResQuarMailURIs
) )
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--config', required=True, help="Path to config file") parser.add_argument('--config', required=True, help="Path to config file")
@ -44,6 +44,10 @@ try:
'/api/v1/quarmails/<int:quarmail_id>/attachments/<int:attachment_id>', '/api/v1/quarmails/<int:quarmail_id>/attachments/<int:attachment_id>',
resource_class_kwargs={'gulag_object': gulag} resource_class_kwargs={'gulag_object': gulag}
) )
api.add_resource(ResQuarMailURIs,
'/api/v1/quarmails/<int:quarmail_id>/uris',
resource_class_kwargs={'gulag_object': gulag}
)
api.add_resource(ResAttachments, api.add_resource(ResAttachments,
'/api/v1/attachments', '/api/v1/attachments',
resource_class_kwargs={'gulag_object': gulag} resource_class_kwargs={'gulag_object': gulag}

View File

@ -20,10 +20,6 @@ create table Mailboxes(
smtp_pass varchar(2048) default null, smtp_pass varchar(2048) default null,
comment varchar(256) default null comment varchar(256) default null
)ENGINE = InnoDB; )ENGINE = InnoDB;
insert into Mailboxes (email_address,name,imap_user,imap_pass)
values('quarantine-in@example.org','E-Mail inbound quarantine','quarantine-in','quarantine-in_secure_password');
insert into Mailboxes (email_address,name,imap_user,imap_pass)
values('quarantine-out@example.org','E-Mail outbound quarantine','quarantine-out','quarantine-out_secure_password');
insert into Mailboxes (email_address,name,imap_user,imap_pass) insert into Mailboxes (email_address,name,imap_user,imap_pass)
values('quarantine-sandbox@example.org','E-Mail sandbox quarantine','quarantine-sb','quarantine-sb_secure_password'); values('quarantine-sandbox@example.org','E-Mail sandbox quarantine','quarantine-sb','quarantine-sb_secure_password');
@ -60,3 +56,15 @@ create table QuarMail2Attachment (
foreign key (attachment_id) references Attachments (id) on delete cascade on update cascade foreign key (attachment_id) references Attachments (id) on delete cascade on update cascade
)ENGINE = InnoDB; )ENGINE = InnoDB;
create table URIs (
id int unsigned auto_increment primary key,
uri varchar(2048),
fqdn varchar(512)
)ENGINE = InnoDB;
create table QuarMail2URI (
quarmail_id int unsigned,
uri_id int unsigned,
foreign key (quarmail_id) references QuarMails (id) on delete cascade on update cascade,
foreign key (uri_id) references URIs (id) on delete cascade on update cascade
)ENGINE = InnoDB;

View File

@ -1,7 +1,7 @@
swagger: '2.0' swagger: '2.0'
info: info:
description: Gulag quarantine REST API description: Gulag quarantine REST API
version: '18.12' version: "1.0.0"
title: Gulag quarantine REST API title: Gulag quarantine REST API
contact: contact:
email: info@dc-it-con.de email: info@dc-it-con.de
@ -208,6 +208,35 @@ paths:
500: 500:
description: server error description: server error
/quarmails/{quarmail_id}/uris:
get:
summary: "retrieves all URIs from any main MIME part (text/plain,text/html)"
operationId: get_quarmail_uris
produces:
- application/json
parameters:
- in: path
name: quarmail_id
description: unique id of quarantined email
required: true
type: string
- in: query
name: from_rfc822_message
description: fetch all URIs from RFC822 message not from database
required: false
type: string
responses:
200:
description: array of URIs
schema:
type: array
items:
$ref: '#/definitions/URI'
400:
description: bad input parameter
500:
description: server error
definitions: definitions:
QuarMail: QuarMail:
type: object type: object
@ -321,3 +350,16 @@ definitions:
data: data:
type: string type: string
description: raw/encoded (see content_encoding) attachment payload description: raw/encoded (see content_encoding) attachment payload
URI:
type: object
required:
- id
- uri
- fqdn
properties:
id:
type: integer
uri:
type: string
fqdn:
type: string