sub uri extraction bug fixed

2026-02-03 20:34:23 +00:00 · 2018-12-17 19:59:14 +01:00 · 2018-12-17 19:59:14 +01:00 · eb3a1e9bf2
commit eb3a1e9bf2
parent acda4c8e11
2 changed files with 6 additions and 3 deletions
--- a/app/Gulag.py
+++ b/app/Gulag.py
@@ -166,8 +166,10 @@ class Gulag:
          ctype = part.get_content_type()
          if(ctype == 'text/plain' or ctype == 'text/html'):
            curis = {}
-            curis = extract_uris(part.get_payload(decode=True).decode("utf-8"))
+#            curis = extract_uris(part.get_payload(decode=True).decode("utf-8"))
+            curis = extract_uris(part.get_payload(decode=True).decode("utf-8","replace"))
            if(len(curis) > 0):
+              logging.info(whoami(self) + "CURIS: " + str(curis))
              uris = {**uris, **curis}
        # End for msg.walk()
        # link message with attachments
--- a/app/GulagUtils.py
+++ b/app/GulagUtils.py
@@ -24,12 +24,13 @@ def send_mail(args):
 def extract_uris(input_text):
  uris = {}
  uri_pattern = r'(https?:\/\/[^\s<>"]+)'
+  suburi_pattern = r'^.+(https?:\/\/[^\s<>"]+)'
  for m in re.finditer(uri_pattern, input_text):
    uri = urllib.parse.unquote(m.group(0))
    uris[uri] = {}
    # extract sub-URIs (google redirector: https://www.google.de/url?sa=t&url=...)
-    for m2 in re.finditer(uri_pattern, uri):
-      suburi = urllib.parse.unquote(m2.group(0))
+    for m2 in re.finditer(suburi_pattern, uri):
+      suburi = urllib.parse.unquote(m2.group(1))
      uris[suburi] = {"suburi": True}
  return uris