import os
import time
import urllib
import urllib.request

import jellyfish
class Download:
    '''
    Utility functions for downloading data
    '''

    @staticmethod
    def getURLContent(url: str) -> str:
        '''
        read the content behind the given url and return it as text

        Args:
            url(str): the url to open

        Return:
            str: the response body decoded with the default codec of
            bytes.decode (utf-8)
        '''
        # urllib.request is a submodule: it has to be imported explicitly,
        # a plain "import urllib" does not make urllib.request available
        with urllib.request.urlopen(url) as urlResponse:
            content = urlResponse.read().decode()
        return content

    @staticmethod
    def getFileContent(path: str) -> str:
        '''
        read the complete text content of the given file

        Args:
            path(str): the path of the file to read

        Return:
            str: the content of the file
        '''
        with open(path, "r") as file:
            content = file.read()
        return content

    @staticmethod
    def needsDownload(filePath: str, force: bool = False) -> bool:
        '''
        check if a download of the given filePath is necessary that is the file
        does not exist, has a size of zero or the download should be forced

        Args:
            filePath(str): the path of the file to be checked
            force(bool): True if the result should be forced to True

        Return:
            bool: True if a download for this file is needed
        '''
        # a missing file always needs a download - no need to look at force
        if not os.path.isfile(filePath):
            return True
        # an existing file only needs a download if it is empty or forced
        return force or os.stat(filePath).st_size == 0
class Profiler:
    '''
    simple profiler
    '''

    def __init__(self, msg, profile=True):
        '''
        construct me with the given msg and profile active flag

        Args:
            msg(str): the message to show if profiling is active
            profile(bool): True if messages should be shown
        '''
        self.msg = msg
        self.profile = profile
        # wall clock time of the start - kept as public attribute
        self.starttime = time.time()
        # reference point for measuring the duration: perf_counter is not
        # affected by adjustments of the system clock, so the elapsed
        # time can not jump or become negative
        self._startCounter = time.perf_counter()
        if profile:
            print(f"Starting {msg} ...")

    def time(self, extraMsg=""):
        '''
        time the action and print if profile is active

        Args:
            extraMsg(str): text to append to the message in the output

        Return:
            float: the seconds elapsed since construction
        '''
        elapsed = time.perf_counter() - self._startCounter
        if self.profile:
            print(f"{self.msg}{extraMsg} took {elapsed:5.1f} s")
        return elapsed
def remove_non_ascii(s):
    '''
    Strip all characters outside of the 7 bit ASCII range from the given string

    Args:
        s(str): the string to remove chars from

    Returns:
        str: the given string without the characters that have a
        code point of 128 or above

    Hat tip: http://stackoverflow.com/a/1342373/2367526
    '''
    asciiChars = [char for char in s if ord(char) <= 127]
    return "".join(asciiChars)
def fuzzy_match(s1, s2, max_dist=.8):
    '''
    Fuzzy match the given two strings based on their Jaro-Winkler similarity

    uses jellyfish jaro_winkler_similarity
    see https://en.wikipedia.org/wiki/Jaro-Winkler_distance

    Args:
        s1(str): first string
        s2(str): second string
        max_dist(float): the threshold to compare the similarity with -
            default: 0.8. Despite its name the value is used as the lower
            bound of the similarity and not as an upper bound of a distance

    Returns:
        bool: True if the similarity of s1 and s2 is greater than or
        equal to max_dist, otherwise False
    '''
    similarity = jellyfish.jaro_winkler_similarity(s1, s2)
    return similarity >= max_dist