# pylint: skip-file

import os
import time
import tempfile
import threading
import urllib.parse
import pycurl
from hashlib import sha256
from io import BytesIO
from tenacity import retry, wait_random_exponential, stop_after_attempt
from common.file_helpers import mkdirs_exists_ok, atomic_write_in_dir

# Cache chunk size
K = 1000
CHUNK_SIZE = 1000 * K

CACHE_DIR = os.environ.get("COMMA_CACHE", "/tmp/comma_download_cache/")


def hash_256(link):
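  """Return the SHA-256 hex digest of the URL with its query string stripped,
  so that URLs differing only in query parameters (e.g. signing tokens) map to
  the same cache key."""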
  return sha256(link.split("?")[0].encode('utf-8')).hexdigest()


class URLFile:
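  """File-like object backed by a remote URL.

  Data is fetched with pycurl; unless downloads are forced, reads go through an
  on-disk cache of CHUNK_SIZE-sized pieces stored under CACHE_DIR.
  """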
  _tlocal = threading.local()

  def __init__(self, url, debug=False, cache=None):
    self._url = url
    self._pos = 0
    self._length = None
    self._local_file = None
    self._debug = debug
    # Downloads bypass the cache by default; setting FILEREADER_CACHE=1 enables
    # caching, and an explicit `cache` argument overrides either behavior.
    self._force_download = not int(os.environ.get("FILEREADER_CACHE", "0"))
    if cache is not None:
      self._force_download = not cache

    # Reuse one pycurl handle per thread; curl easy handles are not thread-safe.
    try:
      self._curl = self._tlocal.curl
    except AttributeError:
      self._curl = self._tlocal.curl = pycurl.Curl()
    mkdirs_exists_ok(CACHE_DIR)

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    if self._local_file is not None:
      self._local_file.close()
      os.remove(self._local_file.name)
      self._local_file = None

  @retry(wait=wait_random_exponential(multiplier=1, max=5), stop=stop_after_attempt(3), reraise=True)
  def get_length_online(self):
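    """Request the remote Content-Length over the network without downloading the body."""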
    c = self._curl
    c.reset()
    c.setopt(pycurl.NOSIGNAL, 1)
    c.setopt(pycurl.TIMEOUT_MS, 500000)
    c.setopt(pycurl.FOLLOWLOCATION, True)
    c.setopt(pycurl.URL, self._url)
    c.setopt(c.NOBODY, 1)
    c.perform()
    length = int(c.getinfo(c.CONTENT_LENGTH_DOWNLOAD))
    c.reset()
    return length

  def get_length(self):
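    """Return the remote file length, preferring the cached in-memory value,
       then the on-disk length cache when caching is enabled, then a network
       request."""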
    if self._length is not None:
      return self._length
    file_length_path = os.path.join(CACHE_DIR, hash_256(self._url) + "_length")
    if os.path.exists(file_length_path) and not self._force_download:
      with open(file_length_path) as file_length:
        content = file_length.read()
        self._length = int(content)
        return self._length

    self._length = self.get_length_online()
    if not self._force_download:
      with atomic_write_in_dir(file_length_path, mode="w") as file_length:
        file_length.write(str(self._length))
    return self._length

  def read(self, ll=None):
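    """Read ll bytes from the current position, or the rest of the file when ll
       is None. Unless downloads are forced, data is served from (and written
       back to) the on-disk chunk cache."""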
    if self._force_download:
      return self.read_aux(ll=ll)

    file_begin = self._pos
    file_end = self._pos + ll if ll is not None else self.get_length()
    assert file_end != -1, f"Remote file is empty or doesn't exist: {self._url}"
    # We have to align with the chunks we store. Position is the beginning of the
    # latest chunk that starts before or at our file offset.
    position = (file_begin // CHUNK_SIZE) * CHUNK_SIZE
    response = b""
    while True:
      self._pos = position
      chunk_number = self._pos // CHUNK_SIZE
      file_name = hash_256(self._url) + "_" + str(chunk_number)
      full_path = os.path.join(CACHE_DIR, file_name)
      data = None
      # If we don't have the chunk on disk, download it
      if not os.path.exists(full_path):
        data = self.read_aux(ll=CHUNK_SIZE)
        with atomic_write_in_dir(full_path, mode="wb") as new_cached_file:
          new_cached_file.write(data)
      else:
        with open(full_path, "rb") as cached_file:
          data = cached_file.read()

      response += data[max(0, file_begin - position): min(CHUNK_SIZE, file_end - position)]

      position += CHUNK_SIZE
      if position >= file_end:
        self._pos = file_end
        return response

  @retry(wait=wait_random_exponential(multiplier=1, max=5), stop=stop_after_attempt(3), reraise=True)
  def read_aux(self, ll=None):
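    """Download data directly from the URL into memory, sending an HTTP Range
       request unless the whole file is being read from offset 0."""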
    download_range = False
    headers = ["Connection: keep-alive"]
    if self._pos != 0 or ll is not None:
      if ll is None:
        end = self.get_length() - 1
      else:
        end = min(self._pos + ll, self.get_length()) - 1
      if self._pos >= end:
        return b""
      headers.append(f"Range: bytes={self._pos}-{end}")
      download_range = True

    dats = BytesIO()
    c = self._curl
    c.setopt(pycurl.URL, self._url)
    c.setopt(pycurl.WRITEDATA, dats)
    c.setopt(pycurl.NOSIGNAL, 1)
    c.setopt(pycurl.TIMEOUT_MS, 500000)
    c.setopt(pycurl.HTTPHEADER, headers)
    c.setopt(pycurl.FOLLOWLOCATION, True)

    if self._debug:
      print("downloading", self._url)

      def header(x):
        if b'MISS' in x:
          print(x.strip())

      c.setopt(pycurl.HEADERFUNCTION, header)

      def test(debug_type, debug_msg):
        print("  debug(%d): %s" % (debug_type, debug_msg.strip()))

      c.setopt(pycurl.VERBOSE, 1)
      c.setopt(pycurl.DEBUGFUNCTION, test)
      t1 = time.time()

    c.perform()

    if self._debug:
      t2 = time.time()
      if t2 - t1 > 0.1:
        print(f"get {self._url} {headers!r} {t2 - t1:.3f} slow")

    response_code = c.getinfo(pycurl.RESPONSE_CODE)
    if response_code == 416:  # Requested Range Not Satisfiable
      raise Exception(f"Error, range out of bounds {response_code} {headers} ({self._url}): {repr(dats.getvalue())[:500]}")
    if download_range and response_code != 206:  # Partial Content
      raise Exception(f"Error, requested range but got unexpected response {response_code} {headers} ({self._url}): {repr(dats.getvalue())[:500]}")
    if (not download_range) and response_code != 200:  # OK
      raise Exception(f"Error {response_code} {headers} ({self._url}): {repr(dats.getvalue())[:500]}")

    ret = dats.getvalue()
    self._pos += len(ret)
    return ret

  def seek(self, pos):
    self._pos = pos

  @property
  def name(self):
    """Returns a local path to a file with the URLFile's contents.

       This can be used to interface with modules that require local files.
    """
    if self._local_file is None:
      _, ext = os.path.splitext(urllib.parse.urlparse(self._url).path)
      local_fd, local_path = tempfile.mkstemp(suffix=ext)
      try:
        os.write(local_fd, self.read())
        local_file = open(local_path, "rb")
      except Exception:
        os.remove(local_path)
        raise
      finally:
        os.close(local_fd)

      self._local_file = local_file
      self.read = self._local_file.read
      self.seek = self._local_file.seek

    return self._local_file.name
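

# Usage sketch (illustrative only, not part of the original module). URLFile is
# meant to be used as a context manager; the URL below is a placeholder.
#
#   with URLFile("https://example.com/logs/segment.bz2", cache=True) as f:
#     size = f.get_length()
#     head = f.read(1024)      # served from the chunk cache when available
#     f.seek(0)
#     local_path = f.name      # materializes the contents in a temp file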