Fix sub-URI extraction bug

This commit is contained in:
Dominik Chilla 2018-12-17 19:59:14 +01:00
parent acda4c8e11
commit eb3a1e9bf2
2 changed files with 6 additions and 3 deletions

View File

@ -166,8 +166,10 @@ class Gulag:
ctype = part.get_content_type()
if(ctype == 'text/plain' or ctype == 'text/html'):
curis = {}
curis = extract_uris(part.get_payload(decode=True).decode("utf-8"))
# curis = extract_uris(part.get_payload(decode=True).decode("utf-8"))
curis = extract_uris(part.get_payload(decode=True).decode("utf-8","replace"))
if(len(curis) > 0):
logging.info(whoami(self) + "CURIS: " + str(curis))
uris = {**uris, **curis}
# End for msg.walk()
# link message with attachments

View File

@ -24,12 +24,13 @@ def send_mail(args):
def extract_uris(input_text):
    """Collect the http(s) URIs found in *input_text*.

    Every match of the URI pattern is percent-decoded and stored as a key
    mapped to an empty dict. If a decoded URI embeds another http(s) URI
    after its first character (e.g. a redirector link), the embedded URI is
    stored as well, mapped to ``{"suburi": True}``.

    Args:
        input_text: Text (plain or HTML) to scan.

    Returns:
        dict mapping each URI string to a dict of flags: ``{}`` for a
        top-level URI, ``{"suburi": True}`` for an embedded one.
    """
    uri_pattern = r'(https?:\/\/[^\s<>"]+)'
    # The leading "^.+" demands at least one character before the nested
    # scheme, so a URI is never reported as a sub-URI of itself. Because
    # ".+" is greedy, the captured group is the LAST nested URI.
    suburi_pattern = r'^.+(https?:\/\/[^\s<>"]+)'
    uris = {}
    for m in re.finditer(uri_pattern, input_text):
        uri = urllib.parse.unquote(m.group(0))
        uris[uri] = {}
        # extract sub-URIs (google redirector: https://www.google.de/url?sa=t&url=...)
        # The pattern is anchored at the start, so it can match at most once.
        m2 = re.search(suburi_pattern, uri)
        if m2 is not None:
            suburi = urllib.parse.unquote(m2.group(1))
            uris[suburi] = {"suburi": True}
    return uris