attachment handling with magic and sub-URI parsing

This commit is contained in:
Dominik Chilla 2018-12-16 23:35:10 +01:00
parent 5723b18367
commit acda4c8e11
7 changed files with 62 additions and 23 deletions

View File

@ -97,6 +97,7 @@ class QuarMail:
msg_size = None msg_size = None
href = None href = None
attach_count = None attach_count = None
uri_count = None
def __init__(self,qm_ref): def __init__(self,qm_ref):
if 'id' not in qm_ref: if 'id' not in qm_ref:
@ -139,6 +140,8 @@ class QuarMail:
self.href = qm_ref['href'] self.href = qm_ref['href']
if 'attach_count' in qm_ref: if 'attach_count' in qm_ref:
self.attach_count = qm_ref['attach_count'] self.attach_count = qm_ref['attach_count']
if 'uri_count' in qm_ref:
self.uri_count = qm_ref['uri_count']
class AttachmentException(Exception): class AttachmentException(Exception):
message = None message = None
@ -150,6 +153,7 @@ class Attachment:
filename = None filename = None
content_type = None content_type = None
content_encoding = None content_encoding = None
magic = None
comment = None comment = None
mailbox_id = None mailbox_id = None
imap_uid = None imap_uid = None
@ -167,6 +171,9 @@ class Attachment:
self.content_type = at_ref['content_type'] self.content_type = at_ref['content_type']
if 'content_encoding' in at_ref: if 'content_encoding' in at_ref:
self.content_encoding = at_ref['content_encoding'] self.content_encoding = at_ref['content_encoding']
if 'magic' not in at_ref:
raise AttachmentException("'magic' is mandatory!")
self.magic = at_ref['magic']
if 'comment' in at_ref: if 'comment' in at_ref:
self.comment = at_ref['comment'] self.comment = at_ref['comment']
if 'mailbox_id' not in at_ref: if 'mailbox_id' not in at_ref:

View File

@ -1,4 +1,4 @@
import json,sys,os,logging,re import json,sys,os,logging,re,magic
import email,email.header,email.message import email,email.header,email.message
from GulagDB import GulagDB,GulagDBException from GulagDB import GulagDB,GulagDBException
from GulagMailbox import IMAPmailbox,IMAPmailboxException from GulagMailbox import IMAPmailbox,IMAPmailboxException
@ -30,6 +30,7 @@ class Gulag:
raise GulagException(whoami(self) + "Logging not configured!") raise GulagException(whoami(self) + "Logging not configured!")
if('filename' in self.config['logging'] and if('filename' in self.config['logging'] and
len(self.config['logging']['filename']) > 0): len(self.config['logging']['filename']) > 0):
# TODO: Exception handling
logging.basicConfig( logging.basicConfig(
filename=self.config['logging']['filename'], filename=self.config['logging']['filename'],
format='%(asctime)s %(levelname)s %(message)s', format='%(asctime)s %(levelname)s %(message)s',
@ -148,10 +149,16 @@ class Gulag:
else: else:
# filename isn´t encoded # filename isn´t encoded
filename = filename[0][0] filename = filename[0][0]
attach_magic = None
try:
attach_magic = magic.from_buffer(part.get_payload(decode=True))
except:
logging.info(whoami(self) + ": " + str(sys.exc_info()))
attach_id = self.db.add_attachment({ attach_id = self.db.add_attachment({
'filename': filename, 'filename': filename,
'content_type': part.get_content_type(), 'content_type': part.get_content_type(),
'content_encoding': part['Content-Transfer-Encoding'] 'content_encoding': part['Content-Transfer-Encoding'],
'magic': attach_magic
}) })
attachments.append(attach_id) attachments.append(attach_id)
# End if part.get_filename() # End if part.get_filename()
@ -168,6 +175,9 @@ class Gulag:
for quarmail_id in quarmail_ids: for quarmail_id in quarmail_ids:
for attachment_id in attachments: for attachment_id in attachments:
self.db.quarmail2attachment(str(quarmail_id), str(attachment_id)) self.db.quarmail2attachment(str(quarmail_id), str(attachment_id))
logging.info(whoami(self) +
"Attachment("+str(attachment_id)+")@QuarMail("+str(quarmail_id)+") imported"
)
# link message with uris # link message with uris
if(len(uris) > 0): if(len(uris) > 0):
for quarmail_id in quarmail_ids: for quarmail_id in quarmail_ids:
@ -178,6 +188,9 @@ class Gulag:
"fqdn": extract_fqdn(uri) "fqdn": extract_fqdn(uri)
}) })
self.db.quarmail2uri(str(quarmail_id), str(uri_id)) self.db.quarmail2uri(str(quarmail_id), str(uri_id))
logging.info(whoami(self) +
"URI("+str(uri_id)+")@QuarMail("+str(quarmail_id)+") imported"
)
except GulagDBException as e: except GulagDBException as e:
logging.error(whoami(self) + e.message) logging.error(whoami(self) + e.message)
# End for(unseen) # End for(unseen)

View File

@ -185,7 +185,9 @@ class GulagDB:
try: try:
cursor = self.conn.cursor() cursor = self.conn.cursor()
query = "select *,(select count(*) from QuarMail2Attachment" query = "select *,(select count(*) from QuarMail2Attachment"
query += " where QuarMails.id=QuarMail2Attachment.quarmail_id) as attach_count" query += " where QuarMails.id=QuarMail2Attachment.quarmail_id) as attach_count,"
query += " (select count(*) from QuarMail2URI"
query += " where QuarMails.id=QuarMail2URI.quarmail_id) as uri_count"
query += " from QuarMails " + self.get_where_clause(args) query += " from QuarMails " + self.get_where_clause(args)
query += " " + self.get_limit_clause(args) + " ;" query += " " + self.get_limit_clause(args) + " ;"
cursor.execute(query) cursor.execute(query)
@ -213,10 +215,10 @@ class GulagDB:
def get_quarmail(self,args): def get_quarmail(self,args):
try: try:
cursor = self.conn.cursor() cursor = self.conn.cursor()
# TODO: build SQL query by args
#query = "select * from QuarMails where id='" + args['id'] + "';"
query = "select *,(select count(*) from QuarMail2Attachment" query = "select *,(select count(*) from QuarMail2Attachment"
query += " where QuarMails.id=QuarMail2Attachment.quarmail_id) as attach_count" query += " where QuarMails.id=QuarMail2Attachment.quarmail_id) as attach_count,"
query += " (select count(*) from QuarMail2URI"
query += " where QuarMails.id=QuarMail2URI.quarmail_id) as uri_count"
query += " from QuarMails where QuarMails.id="+ str(args['id']) +";" query += " from QuarMails where QuarMails.id="+ str(args['id']) +";"
cursor.execute(query) cursor.execute(query)
data = cursor.fetchall() data = cursor.fetchall()
@ -266,8 +268,9 @@ class GulagDB:
try: try:
cursor = self.conn.cursor() cursor = self.conn.cursor()
cursor.execute("insert into Attachments " + cursor.execute("insert into Attachments " +
"(filename, content_type, content_encoding) values (%s,%s,%s)", "(filename,content_type,content_encoding,magic) values (%s,%s,%s,%s)",
(attach['filename'], attach['content_type'], attach['content_encoding']) (attach['filename'],attach['content_type'],
attach['content_encoding'],attach['magic'])
) )
return cursor.lastrowid return cursor.lastrowid
except mariadb.Error as e: except mariadb.Error as e:

View File

@ -1,4 +1,5 @@
import sys,re import sys,re,urllib
from urllib.parse import urlparse
from smtplib import SMTP from smtplib import SMTP
def whoami(obj): def whoami(obj):
@ -6,7 +7,7 @@ def whoami(obj):
def send_mail(args): def send_mail(args):
try: try:
# FIXME: SMTP tranaport security and authentication! # FIXME: SMTP transport security and authentication!
# with SMTP(host=mailbox['smtp_server'],port=mailbox['smtp_port']) as smtp: # with SMTP(host=mailbox['smtp_server'],port=mailbox['smtp_port']) as smtp:
# try: # try:
# smtp.sendmail( # smtp.sendmail(
@ -20,17 +21,22 @@ def send_mail(args):
except TimeoutError as e: except TimeoutError as e:
raise Exception('xyz') from e raise Exception('xyz') from e
def extract_uris(input_text):
    """Collect the http(s) URIs contained in *input_text*.

    Every match is percent-decoded before it is stored. URIs that are
    embedded inside another URI (e.g. the target of a redirector such as
    ``https://www.google.de/url?sa=t&url=...``) are reported as well and
    flagged as sub-URIs.

    Args:
        input_text: Text to scan. ``None`` or an empty string yields an
            empty result.

    Returns:
        A dict keyed by decoded URI. The value is ``{}`` for a URI found
        directly in the text and ``{"suburi": True}`` for a URI that was
        only found embedded in another one.
    """
    uris = {}
    if not input_text:
        return uris
    uri_regex = re.compile(r'(https?:\/\/[^\s<>"]+)')
    for match in uri_regex.finditer(input_text):
        uri = urllib.parse.unquote(match.group(0))
        # A URI seen at top level is never flagged as sub-URI, even if an
        # earlier redirector already registered it as one.
        uris[uri] = {}
        # Look for embedded URIs. Scanning has to start behind the first
        # character: a scan from offset 0 would simply re-match the
        # enclosing URI as a whole and hide everything nested in it.
        # NOTE: an embedded URI extends to the end of the enclosing one,
        # so trailing parameters of the outer URI are part of it.
        pos = 1
        while True:
            nested = uri_regex.search(uri, pos)
            if nested is None:
                break
            suburi = urllib.parse.unquote(nested.group(0))
            uris.setdefault(suburi, {"suburi": True})
            # Continue inside the hit to catch deeper nesting levels.
            pos = nested.start() + 1
    return uris
def extract_fqdn(uri):
    """Return the host name part of *uri*, or ``None`` if there is none.

    The URI is split with ``urlparse``; the value returned is the parse
    result's ``hostname`` attribute (user info and port are not part of
    it, and ``urlparse`` reports it in lower case).

    Args:
        uri: The URI to inspect.

    Returns:
        The host name, or ``None`` when *uri* is not text, cannot be
        parsed, or carries no network location (e.g. no scheme).
    """
    # Anything but text makes urlparse fail with errors other than
    # ValueError; treat that like any other unparseable input.
    if not isinstance(uri, (str, bytes)):
        return None
    try:
        return urlparse(uri).hostname
    except ValueError:
        # Raised by urlparse for malformed input, e.g. an unbalanced
        # IPv6 bracket in the network location.
        return None

View File

@ -46,6 +46,7 @@ create table Attachments (
filename varchar(256) not null, filename varchar(256) not null,
content_type varchar(256) not null, content_type varchar(256) not null,
content_encoding varchar(64), content_encoding varchar(64),
magic varchar(128),
comment varchar(256) comment varchar(256)
)ENGINE = InnoDB; )ENGINE = InnoDB;

View File

@ -9,7 +9,9 @@ RUN set -ex ; \
&& apt-get -qq --no-install-recommends install \ && apt-get -qq --no-install-recommends install \
uwsgi-plugin-python3 python3-setuptools python3-flask \ uwsgi-plugin-python3 python3-setuptools python3-flask \
python3-flask-restful python3-mysql.connector \ python3-flask-restful python3-mysql.connector \
uwsgi uwsgi-plugin-python3 procps net-tools uwsgi uwsgi-plugin-python3 procps net-tools \
python3-pip libmagic1 \
&& pip3 install python-magic
RUN /bin/mkdir /config /socket /app RUN /bin/mkdir /config /socket /app
COPY app/*.py /app/ COPY app/*.py /app/

View File

@ -307,6 +307,9 @@ definitions:
attach_count: attach_count:
type: integer type: integer
description: number of attachments description: number of attachments
uri_count:
type: integer
description: number of uris
rfc822_message: rfc822_message:
type: string type: string
description: full RFC822 email message description: full RFC822 email message
@ -317,6 +320,7 @@ definitions:
- filename - filename
- content_encoding - content_encoding
- content_type - content_type
- magic
- mailbox_id - mailbox_id
- imap_uid - imap_uid
- href - href
@ -336,6 +340,9 @@ definitions:
content_type: content_type:
type: string type: string
example: image/jpeg example: image/jpeg
magic:
type: string
example: "PDF document, version 1.2"
href: href:
type: string type: string
description: hypermedia description: hypermedia