From acda4c8e11abe20e7b5f9f9558b15b732213a3eb Mon Sep 17 00:00:00 2001 From: Dominik Chilla Date: Sun, 16 Dec 2018 23:35:10 +0100 Subject: [PATCH] attachment handling with magic and sub-URI parsing --- app/Entities.py | 7 +++++++ app/Gulag.py | 17 +++++++++++++++-- app/GulagDB.py | 15 +++++++++------ app/GulagUtils.py | 26 ++++++++++++++++---------- db/gulag.sql | 1 + docker/gulag-server/debian/Dockerfile | 12 +++++++----- gulag-openapi-2.0.yaml | 7 +++++++ 7 files changed, 62 insertions(+), 23 deletions(-) diff --git a/app/Entities.py b/app/Entities.py index fd6c40a..8de6e87 100644 --- a/app/Entities.py +++ b/app/Entities.py @@ -97,6 +97,7 @@ class QuarMail: msg_size = None href = None attach_count = None + uri_count = None def __init__(self,qm_ref): if 'id' not in qm_ref: @@ -139,6 +140,8 @@ class QuarMail: self.href = qm_ref['href'] if 'attach_count' in qm_ref: self.attach_count = qm_ref['attach_count'] + if 'uri_count' in qm_ref: + self.uri_count = qm_ref['uri_count'] class AttachmentException(Exception): message = None @@ -150,6 +153,7 @@ class Attachment: filename = None content_type = None content_encoding = None + magic = None comment = None mailbox_id = None imap_uid = None @@ -167,6 +171,9 @@ class Attachment: self.content_type = at_ref['content_type'] if 'content_encoding' in at_ref: self.content_encoding = at_ref['content_encoding'] + if 'magic' not in at_ref: + raise AttachmentException("'magic' is mandatory!") + self.magic = at_ref['magic'] if 'comment' in at_ref: self.comment = at_ref['comment'] if 'mailbox_id' not in at_ref: diff --git a/app/Gulag.py b/app/Gulag.py index 2e442cc..f497380 100644 --- a/app/Gulag.py +++ b/app/Gulag.py @@ -1,4 +1,4 @@ -import json,sys,os,logging,re +import json,sys,os,logging,re,magic import email,email.header,email.message from GulagDB import GulagDB,GulagDBException from GulagMailbox import IMAPmailbox,IMAPmailboxException @@ -30,6 +30,7 @@ class Gulag: raise GulagException(whoami(self) + "Logging not configured!") if('filename' in self.config['logging'] and len(self.config['logging']['filename']) > 0): + # TODO: Exception handling logging.basicConfig( filename=self.config['logging']['filename'], format='%(asctime)s %(levelname)s %(message)s', @@ -148,10 +149,16 @@ class Gulag: else: # filename isn“t encoded filename = filename[0][0] + attach_magic = None + try: + attach_magic = magic.from_buffer(part.get_payload(decode=True)) + except: + logging.info(whoami(self) + ": " + str(sys.exc_info())) attach_id = self.db.add_attachment({ 'filename': filename, 'content_type': part.get_content_type(), - 'content_encoding': part['Content-Transfer-Encoding'] + 'content_encoding': part['Content-Transfer-Encoding'], + 'magic': attach_magic }) attachments.append(attach_id) # End if part.get_filename() @@ -168,6 +175,9 @@ class Gulag: for quarmail_id in quarmail_ids: for attachment_id in attachments: self.db.quarmail2attachment(str(quarmail_id), str(attachment_id)) + logging.info(whoami(self) + + "Attachment("+str(attachment_id)+")@QuarMail("+str(quarmail_id)+") imported" + ) # link message with uris if(len(uris) > 0): for quarmail_id in quarmail_ids: @@ -178,6 +188,9 @@ class Gulag: "fqdn": extract_fqdn(uri) }) self.db.quarmail2uri(str(quarmail_id), str(uri_id)) + logging.info(whoami(self) + + "URI("+str(uri_id)+")@QuarMail("+str(quarmail_id)+") imported" + ) except GulagDBException as e: logging.error(whoami(self) + e.message) # End for(unseen) diff --git a/app/GulagDB.py b/app/GulagDB.py index 29a4a62..5e96606 100644 --- a/app/GulagDB.py +++ b/app/GulagDB.py @@ -185,7 +185,9 @@ class GulagDB: try: cursor = self.conn.cursor() query = "select *,(select count(*) from QuarMail2Attachment" - query += " where QuarMails.id=QuarMail2Attachment.quarmail_id) as attach_count" + query += " where QuarMails.id=QuarMail2Attachment.quarmail_id) as attach_count," + query += " (select count(*) from QuarMail2URI" + query += " where QuarMails.id=QuarMail2URI.quarmail_id) as uri_count" query += " from QuarMails " + self.get_where_clause(args) query += " " + self.get_limit_clause(args) + " ;" cursor.execute(query) @@ -213,10 +215,10 @@ class GulagDB: def get_quarmail(self,args): try: cursor = self.conn.cursor() - # TODO: build SQL query by args - #query = "select * from QuarMails where id='" + args['id'] + "';" query = "select *,(select count(*) from QuarMail2Attachment" - query += " where QuarMails.id=QuarMail2Attachment.quarmail_id) as attach_count" + query += " where QuarMails.id=QuarMail2Attachment.quarmail_id) as attach_count," + query += " (select count(*) from QuarMail2URI" + query += " where QuarMails.id=QuarMail2URI.quarmail_id) as uri_count" query += " from QuarMails where QuarMails.id="+ str(args['id']) +";" cursor.execute(query) data = cursor.fetchall() @@ -266,8 +268,9 @@ class GulagDB: try: cursor = self.conn.cursor() cursor.execute("insert into Attachments " + - "(filename, content_type, content_encoding) values (%s,%s,%s)", - (attach['filename'], attach['content_type'], attach['content_encoding']) + "(filename,content_type,content_encoding,magic) values (%s,%s,%s,%s)", + (attach['filename'],attach['content_type'], + attach['content_encoding'],attach['magic']) ) return cursor.lastrowid except mariadb.Error as e: diff --git a/app/GulagUtils.py b/app/GulagUtils.py index 31d0661..1bc306c 100644 --- a/app/GulagUtils.py +++ b/app/GulagUtils.py @@ -1,4 +1,5 @@ -import sys,re +import sys,re,urllib +from urllib.parse import urlparse from smtplib import SMTP def whoami(obj): @@ -6,7 +7,7 @@ def whoami(obj): def send_mail(args): try: - # FIXME: SMTP tranaport security and authentication! + # FIXME: SMTP transport security and authentication! # with SMTP(host=mailbox['smtp_server'],port=mailbox['smtp_port']) as smtp: # try: # smtp.sendmail( @@ -20,17 +21,22 @@ def send_mail(args): except TimeoutError as e: raise Exception('xyz') from e -def extract_uris(string): +def extract_uris(input_text): uris = {} uri_pattern = r'(https?:\/\/[^\s<>"]+)' - for m in re.finditer(uri_pattern, string): - uris[m.group(0)] = {} + for m in re.finditer(uri_pattern, input_text): + uri = urllib.parse.unquote(m.group(0)) + uris[uri] = {} + # extract sub-URIs (google redirector: https://www.google.de/url?sa=t&url=...) + for m2 in re.finditer(uri_pattern, uri): + suburi = urllib.parse.unquote(m2.group(0)) + uris[suburi] = {"suburi": True} return uris def extract_fqdn(uri): - uri_pattern = r'(https?:\/\/[^\s<>"]+)' - if(re.match(uri_pattern,uri)): - m = re.match(r'https?:\/\/([^:\/]+)', uri) - return m.group(1) - else: + puri = None + try: + puri = urlparse(uri) + return puri.hostname + except ValueError as e: return None diff --git a/db/gulag.sql b/db/gulag.sql index e9d34a2..fec328d 100644 --- a/db/gulag.sql +++ b/db/gulag.sql @@ -46,6 +46,7 @@ create table Attachments ( filename varchar(256) not null, content_type varchar(256) not null, content_encoding varchar(64), + magic varchar(128), comment varchar(256) )ENGINE = InnoDB; diff --git a/docker/gulag-server/debian/Dockerfile b/docker/gulag-server/debian/Dockerfile index bd83831..b2ee86d 100644 --- a/docker/gulag-server/debian/Dockerfile +++ b/docker/gulag-server/debian/Dockerfile @@ -5,11 +5,13 @@ ENV DEBIAN_FRONTEND=noninteractive \ TZ=Europe/Berlin RUN set -ex ; \ - apt-get -qq update \ - && apt-get -qq --no-install-recommends install \ - uwsgi-plugin-python3 python3-setuptools python3-flask \ - python3-flask-restful python3-mysql.connector \ - uwsgi uwsgi-plugin-python3 procps net-tools + apt-get -qq update \ + && apt-get -qq --no-install-recommends install \ + uwsgi-plugin-python3 python3-setuptools python3-flask \ + python3-flask-restful python3-mysql.connector \ + uwsgi uwsgi-plugin-python3 procps net-tools \ + python3-pip libmagic1 \ + && pip3 install python-magic RUN /bin/mkdir /config /socket /app COPY app/*.py /app/ diff --git a/gulag-openapi-2.0.yaml b/gulag-openapi-2.0.yaml index 364c3c9..79f2957 100644 --- a/gulag-openapi-2.0.yaml +++ b/gulag-openapi-2.0.yaml @@ -307,6 +307,9 @@ definitions: attach_count: type: integer description: number of attachments + uri_count: + type: integer + description: number of uris rfc822_message: type: string description: full RFC822 email message @@ -317,6 +320,7 @@ definitions: - filename - content_encoding - content_type + - magic - mailbox_id - imap_uid - href @@ -336,6 +340,9 @@ definitions: content_type: type: string example: image/jpeg + magic: + type: string + example: "PDF document, version 1.2" href: type: string description: hypermedia