From eb3a1e9bf20220c6c797b80cda061590dd43ada7 Mon Sep 17 00:00:00 2001
From: Dominik Chilla <dominik@zwackl.de>
Date: Mon, 17 Dec 2018 19:59:14 +0100
Subject: [PATCH] Fix sub-URI extraction bug

---
 app/Gulag.py      | 4 +++-
 app/GulagUtils.py | 5 +++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/app/Gulag.py b/app/Gulag.py
index f497380..0f56a10 100644
--- a/app/Gulag.py
+++ b/app/Gulag.py
@@ -166,8 +166,10 @@ class Gulag:
           ctype = part.get_content_type()
           if(ctype == 'text/plain' or ctype == 'text/html'):
             curis = {}
-            curis = extract_uris(part.get_payload(decode=True).decode("utf-8"))
+#            curis = extract_uris(part.get_payload(decode=True).decode("utf-8"))
+            curis = extract_uris(part.get_payload(decode=True).decode("utf-8","replace"))
             if(len(curis) > 0):
+              logging.info(whoami(self) + "CURIS: " + str(curis))
               uris = {**uris, **curis}
         # End for msg.walk()
         # link message with attachments
diff --git a/app/GulagUtils.py b/app/GulagUtils.py
index 1bc306c..2c4a3ec 100644
--- a/app/GulagUtils.py
+++ b/app/GulagUtils.py
@@ -24,12 +24,13 @@ def send_mail(args):
 def extract_uris(input_text):
   uris = {}
   uri_pattern = r'(https?:\/\/[^\s<>"]+)'
+  suburi_pattern = r'^.+(https?:\/\/[^\s<>"]+)'
   for m in re.finditer(uri_pattern, input_text):
     uri = urllib.parse.unquote(m.group(0))
     uris[uri] = {}
     # extract sub-URIs (google redirector: https://www.google.de/url?sa=t&url=...)
-    for m2 in re.finditer(uri_pattern, uri):
-      suburi = urllib.parse.unquote(m2.group(0))
+    for m2 in re.finditer(suburi_pattern, uri):
+      suburi = urllib.parse.unquote(m2.group(1))
       uris[suburi] = {"suburi": True}
   return uris