mirror of
https://github.com/chillout2k/gulag.git
synced 2025-12-13 16:00:18 +00:00
27 lines
764 B
Python
27 lines
764 B
Python
import sys,re,urllib
|
|
from urllib.parse import urlparse
|
|
|
|
def whoami(obj):
    """Return a prefix of the form ``<TypeName>::<caller>(): ``.

    ``<TypeName>`` is the class name of *obj* and ``<caller>`` is the name
    of the function that called ``whoami``.
    """
    # Frame 0 is whoami() itself; frame 1 is whoever called it.
    caller_name = sys._getframe(1).f_code.co_name
    type_name = type(obj).__name__
    return "".join((type_name, "::", caller_name, "(): "))
|
|
|
|
def extract_uris(input_text):
    """Collect the http(s) URIs that occur in *input_text*.

    Every match is percent-decoded and used as a key of the returned dict,
    mapped to an empty dict. When a decoded URI contains another http(s)
    URI after at least one leading character, that embedded URI is
    percent-decoded once more and stored with the value
    ``{"suburi": True}``.

    Returns a dict keyed by the decoded URI strings.
    """
    top_level_re = re.compile(r'(https?:\/\/[^\s<>"]+)')
    embedded_re = re.compile(r'^.+(https?:\/\/[^\s<>"]+)')

    found = {}
    # The whole pattern is one capture group, so findall() yields the full
    # matched text of each URI.
    for raw in top_level_re.findall(input_text):
        decoded = urllib.parse.unquote(raw)
        found[decoded] = {}

        # extract sub-URIs (google redirector:
        # https://www.google.de/url?sa=t&url=...)
        # The pattern is anchored at the start, so it matches at most once;
        # the greedy ".+" makes the right-most embedded URI the captured one.
        nested = embedded_re.match(decoded)
        if nested is not None:
            inner = urllib.parse.unquote(nested.group(1))
            found[inner] = {"suburi": True}
    return found
|
|
|
|
def extract_fqdn(uri):
    """Return the hostname component of *uri*, or ``None``.

    The URI is parsed with ``urlparse`` and the parsed result's
    ``hostname`` attribute is returned. ``None`` is returned when that
    attribute is ``None`` or when parsing raises ``ValueError``.
    """
    try:
        hostname = urlparse(uri).hostname
    except ValueError:
        # Parsing failed; report "no hostname" instead of raising.
        hostname = None
    return hostname
|