From eb3a1e9bf20220c6c797b80cda061590dd43ada7 Mon Sep 17 00:00:00 2001 From: Dominik Chilla Date: Mon, 17 Dec 2018 19:59:14 +0100 Subject: [PATCH] sub uri extraction bug fixed --- app/Gulag.py | 4 +++- app/GulagUtils.py | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/app/Gulag.py b/app/Gulag.py index f497380..0f56a10 100644 --- a/app/Gulag.py +++ b/app/Gulag.py @@ -166,8 +166,10 @@ class Gulag: ctype = part.get_content_type() if(ctype == 'text/plain' or ctype == 'text/html'): curis = {} - curis = extract_uris(part.get_payload(decode=True).decode("utf-8")) +# curis = extract_uris(part.get_payload(decode=True).decode("utf-8")) + curis = extract_uris(part.get_payload(decode=True).decode("utf-8","replace")) if(len(curis) > 0): + logging.info(whoami(self) + "CURIS: " + str(curis)) uris = {**uris, **curis} # End for msg.walk() # link message with attachments diff --git a/app/GulagUtils.py b/app/GulagUtils.py index 1bc306c..2c4a3ec 100644 --- a/app/GulagUtils.py +++ b/app/GulagUtils.py @@ -24,12 +24,13 @@ def send_mail(args): def extract_uris(input_text): uris = {} uri_pattern = r'(https?:\/\/[^\s<>"]+)' + suburi_pattern = r'^.+(https?:\/\/[^\s<>"]+)' for m in re.finditer(uri_pattern, input_text): uri = urllib.parse.unquote(m.group(0)) uris[uri] = {} # extract sub-URIs (google redirector: https://www.google.de/url?sa=t&url=...) - for m2 in re.finditer(uri_pattern, uri): - suburi = urllib.parse.unquote(m2.group(0)) + for m2 in re.finditer(suburi_pattern, uri): + suburi = urllib.parse.unquote(m2.group(1)) uris[suburi] = {"suburi": True} return uris