Source code for pyremotedata.implicit_mount

"""
This module provides a pythonic interface for downloading files from a remote directory using the SFTP protocol.

The main functionality of this package is provided through the use of the ImplicitMount, IOHandler and RemotePathIterator classes:

* The **ImplicitMount** class provides a low-level wrapper for the LFTP shell, which is used to communicate with the remote directory, and should only be used directly by advanced users.

* The **IOHandler** class provides a high-level wrapper for the ImplicitMount class, which provides human-friendly methods for downloading files from a remote directory without the need to have technical knowledge on how to use LFTP.

* The **RemotePathIterator** class provides a high-level wrapper for the IOHandler class, and handles asynchronous streaming of files from a remote directory to a local directory using thread-safe buffers.
"""

# Standard library imports
import logging
import os
import queue
import re
import shutil
import subprocess
import tempfile
import threading
import time
import uuid
from queue import Queue
from random import choices, shuffle
from typing import List, Tuple, Union

# Internal imports
from pyremotedata import CLEAR_LINE, ESC_EOL, main_logger, module_logger
from pyremotedata.config import get_implicit_mount_config
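
# Typical end-to-end usage (an illustrative sketch, kept as comments so it is
# not executed on import): stream files from a remote directory with a
# context-managed IOHandler. The remote directory name "my_dataset" and the
# `process` callable are hypothetical placeholders; connection details come
# from the pyremotedata configuration.
#
#   with IOHandler() as io:
#       io.cd("my_dataset")
#       for local_path, remote_path in RemotePathIterator(io, batch_size=64):
#           process(local_path)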


class ImplicitMount:
    """
    This is a low-level wrapper of LFTP, which provides a pythonic interface for executing LFTP commands and reading the output.
    It provides a robust and efficient backend for communicating with a remote storage server using the SFTP protocol, using a persistent LFTP shell handled in the background by a subprocess.

    It is designed to be used as a base class for higher-level wrappers, such as the IOHandler class, or as a standalone class for users familiar with LFTP.

    OBS: The attributes of this class should not be used except for development or advanced use cases; in those cases all responsibility lies with the user.

    TODO: This class relies on a proper SSH setup on your machine (and the remote server) for passwordless SFTP.
    Thoroughly test this on a fresh install, and add instructions, and the possibility for automatic setup, for setting up passwordless SFTP and SSH keys, as well as proper error handling for when this is not set up correctly.

    TODO: Add further arguments such as port, password, etc. to the constructor.

    Args:
        user (str): The username to use for connecting to the remote directory.
        remote (str): The remote server to connect to.
        verbose (bool): If True, print the commands executed by the class.

    .. <Sphinx comment
    Methods:
        format_options(): Format a dictionary of options into a string of command line arguments.
        execute_command(): Execute a command on the LFTP shell.
        mount(): Mount the remote directory.
        unmount(): Unmount the remote directory.
        pget(): Download a single file from the remote directory using multiple connections.
        put(): Upload a single file to the remote directory.
        ls(): List the contents of a directory (on the remote). OBS: In contrast to the other LFTP commands, this function has a lot of additional functionality, such as recursive listing and caching.
        lls(): Locally list the contents of a directory.
        cd(): Change the current directory (on the remote).
        pwd(): Get the current directory (on the remote).
        lcd(): Change the current directory (locally).
        lpwd(): Get the current directory (locally).
        mirror(): Download a directory from the remote.
    Sphinx comment>
    """
    time_stamp_pattern = re.compile(r"^\s*(\S+\s+){8}")  # Used to strip the timestamp from the output of the LFTP shell
    END_OF_OUTPUT = '# LFTP_END_OF_OUTPUT_IDENTIFIER {uuid} #'  # Used to signal the end of output when reading from stdout

    def __init__(self, user: Union[str, None]=None, remote: Union[str, None]=None, verbose: bool=main_logger.isEnabledFor(logging.DEBUG)):
        # Default argument configuration and type checking
        self.default_config = get_implicit_mount_config()
        if user is None:
            user = self.default_config['user']
        if remote is None:
            remote = self.default_config['remote']
        if not isinstance(user, str):
            raise TypeError("Expected str, got {}".format(type(user)))
        if not isinstance(remote, str):
            raise TypeError("Expected str, got {}".format(type(remote)))
        if not isinstance(verbose, bool):
            raise TypeError("Expected bool, got {}".format(type(verbose)))

        # Set attributes
        self.user = user
        self.password = ""  # Assume passwordless lftp (authentication is handled by SSH keys, not SFTP)
        self.remote = remote
        self.lftp_shell = None
        self.verbose = verbose
        self.stdout_queue = Queue()
        self.stderr_queue = Queue()
        self.lock = threading.Lock()

    @staticmethod
    def format_options(**kwargs) -> str:
        """
        Takes a dictionary of options and formats them into a string of command line arguments suitable for LFTP.

        Args:
            **kwargs: Keyword arguments to format.

        Returns:
            str: The formatted arguments.
        """
        options = []
        for key, value in kwargs.items():
            prefix = "-" if len(key) == 1 else "--"
            this_option = f"{prefix}{key}"
            if value is not None and value != "":
                this_option += f" {str(value)}"
            options.append(this_option)
        return " ".join(options)
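
    # Usage sketch: single-letter keys become short options, longer keys become
    # long options, and None/"" values yield bare flags.
    #
    #   ImplicitMount.format_options(n=5, P=3)               # => '-n 5 -P 3'
    #   ImplicitMount.format_options(**{'use-cache': None})  # => '--use-cache'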

    def _readerthread(self, stream, queue: Queue):  # No longer static
        while True:
            output = stream.readline()
            if output:
                queue.put(output)
            else:
                break

    def _read_stdout(self, timeout: float = 0, strip_timestamp: bool = True, uuid_str: Union[str, None] = None) -> List[str]:
        EoU = self.END_OF_OUTPUT.format(uuid=uuid_str)
        lines = []
        start_time = time.time()
        while True:
            if timeout and (time.time() - start_time > timeout):
                raise TimeoutError("Timeout while reading stdout")
            if not self.stdout_queue.empty():
                line = self.stdout_queue.get()
                if EoU in line:
                    break
                if strip_timestamp:
                    line = re.sub(self.time_stamp_pattern, "", line)
                if self.verbose:
                    main_logger.info(line.strip())
                lines.append(line.strip())
            else:
                err = self._read_stderr()
                if err and not err.startswith("wait: no current job"):
                    raise Exception(f"Error while executing command: {err}")
                time.sleep(0.001)  # Moved to the else clause
        return lines

    def _read_stderr(self) -> str:
        errors = []
        while not self.stderr_queue.empty():
            errors.append(self.stderr_queue.get())
        return ''.join(errors)

    def execute_command(self, command: str, output: bool=True, blocking: bool=True, execute: bool=True, default_args: Union[dict, None]=None, **kwargs) -> Union[str, List[str], None]:
        """
        Executes a command on the LFTP shell.

        Args:
            command (str): The command to execute.
            output (bool): If True, the function will return the output of the command. If output is True, blocking must also be True.
            blocking (bool): If True, the function will block until the command is complete.
            execute (bool): If True, the function will execute the command, otherwise it will return the command as a string.
            default_args (Union[dict, None]): A dictionary of default arguments to use for the command. If None, no default arguments will be used.
            **kwargs: Keyword arguments to pass to the command.

        Returns:
            Union[str, List[str], None]: If execute is False, the assembled command is returned as a string. Otherwise, the output of the command is returned as a (possibly empty) list of lines.
        """
        if not isinstance(default_args, dict) and default_args is not None:
            raise TypeError("Expected dict or None, got {}".format(type(default_args)))
        if default_args is None:
            default_args = {}
        # Remove the optional "uuid_str" argument from kwargs
        uuid_str = kwargs.pop("uuid_str", None)
        # Merge default arguments and keyword arguments; keyword arguments override default arguments
        args = {**default_args, **kwargs}
        # Format arguments
        formatted_args = self.format_options(**args)
        # Combine command and arguments
        full_command = f"{command} {formatted_args}" if formatted_args else command
        if output:
            if uuid_str is None:
                uuid_str = str(uuid.uuid4())
        if not blocking:
            full_command += " &"
        if not execute:
            return full_command
        exec_output = self._execute_command(full_command, output=output, blocking=blocking, uuid_str=uuid_str)
        if isinstance(exec_output, list):
            return exec_output
        elif exec_output is None:
            return []
        else:
            raise TypeError("Expected list or None, got {}".format(type(exec_output)))
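
    # Usage sketch: with execute=False the assembled command string is returned
    # instead of being run, which is useful for debugging option handling.
    # Assumes an already mounted ImplicitMount instance named `mount`.
    #
    #   mount.execute_command("cls", execute=False, l=None)  # => 'cls -l'
    #   mount.execute_command("pwd")                         # => e.g. ['/home/user']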

    def _execute_command(self, command: str, output: bool=True, blocking: bool=True, uuid_str: Union[str, None]=None) -> Union[List[str], None]:
        """
        ## DO NOT USE THIS FUNCTION DIRECTLY, USE `ImplicitMount.execute_command` INSTEAD.

        Executes a command on the LFTP shell.

        Args:
            command (str): The command to execute.
            output (bool): If True, the function will return the output of the command. If output is True, blocking must also be True.
            blocking (bool): If True, the function will block until the command is complete.
            uuid_str (str): A unique identifier for the command. Must be specified if output is True.

        Returns:
            Union[List[str], None]: If output is True, a list of strings, each containing one line of the output of the command; otherwise None.
        """
        if output and not blocking:
            raise ValueError("Non-blocking output is not supported.")
        if uuid_str is None and (blocking or output):
            uuid_str = str(uuid.uuid4())
        if not command.endswith("\n"):
            command += "\n"
        with self.lock:
            # TODO: Is it safe to assume that the order of the output is the same as the order of the commands?
            # Why is it "<command> <end_of_output> wait" and not "<command> wait <end_of_output>"?
            ## Assemble command
            # Blocking and end-of-output logic
            if blocking or output:
                command += f"echo {self.END_OF_OUTPUT.format(uuid=uuid_str)}\n"
                if blocking:
                    command += "wait\n"
            # Execute command
            if self.verbose:
                main_logger.info(f"Executing command: {command}")
            self.lftp_shell.stdin.write(command)
            self.lftp_shell.stdin.flush()
            # Read output
            if output:
                return self._read_stdout(uuid_str=uuid_str)
            elif blocking:
                self._read_stdout(uuid_str=uuid_str)
            return None

    def mount(self, lftp_settings: Union[dict, None]=None) -> None:
        """
        Mount the remote directory.

        Args:
            lftp_settings (Union[dict, None]): A dictionary of LFTP settings to use for mounting the remote directory. If None, the default settings will be used.

        Raises:
            Exception: If the subprocess fails to start.
            RuntimeError: If the connection to the remote directory fails.
        """
        # Example settings string:
        # "set mirror:use-pget-n 5;set net:limit-rate 0;set xfer:parallel 5;set mirror:parallel-directories true;set ftp:sync-mode off;"
        # Merge default settings and user settings; user settings override default settings
        lftp_settings = {**self.default_config['lftp'], **lftp_settings} if lftp_settings is not None else self.default_config['lftp']
        # Format settings
        lftp_settings_str = ""
        for key, value in lftp_settings.items():
            lftp_settings_str += f" set {key} {value};"
        if self.verbose:
            main_logger.info(f"Mounting {self.remote} as {self.user}")
        # "Mount" the remote directory using an LFTP shell with the SFTP protocol, the specified user and remote, and the specified LFTP settings
        lftp_mount_cmd = f'open -u {self.user},{self.password} -p 2222 sftp://{self.remote};{lftp_settings_str}'
        if self.verbose:
            main_logger.info("Executing command: lftp")
        # Start the LFTP shell
        try:
            self.lftp_shell = subprocess.Popen(
                executable="lftp",
                args=[],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                bufsize=0,
            )
        except Exception as e:
            # TODO: The exception should probably be a bit more specific
            raise Exception("Failed to start subprocess.") from e

        # Start the stdout and stderr reader threads
        self.stdout_thread = threading.Thread(target=self._readerthread, args=(self.lftp_shell.stdout, self.stdout_queue))
        self.stderr_thread = threading.Thread(target=self._readerthread, args=(self.lftp_shell.stderr, self.stderr_queue))
        # Daemon threads are terminated when the main thread terminates
        self.stdout_thread.daemon = True
        self.stderr_thread.daemon = True
        # Start the threads
        self.stdout_thread.start()
        self.stderr_thread.start()

        # Execute the mount command on the LFTP shell (connect to the remote directory)
        self.execute_command(lftp_mount_cmd, output=False, blocking=False)
        self.cd(self.default_config['default_remote_dir'])
        if self.verbose:
            main_logger.info("Waiting for connection...")
        # Check the connection by executing the pwd command on the LFTP shell
        connected_path = self.pwd()
        if self.verbose:
            main_logger.info(f"Connected to {connected_path}")
        if not connected_path:
            self.unmount()
            raise RuntimeError(f"Failed to connect. Check internet connection or if {self.remote} is online.")

    def unmount(self, timeout: float = 1) -> None:
        """
        Unmount the remote directory.

        Args:
            timeout (float): The maximum time to wait for the LFTP shell to terminate. If the timeout is exceeded, the LFTP shell will be forcefully terminated.
        """
        self.execute_command("exit kill top", output=False, blocking=False)
        # Wait for the LFTP shell to exit, then close the Popen streams explicitly
        waited = 0
        while self.lftp_shell.poll() is None:
            time.sleep(0.1)
            waited += 0.1
            if waited > timeout:
                self.lftp_shell.terminate()
                break
        self.lftp_shell.stdout.close()
        self.lftp_shell.stdin.close()
        self.lftp_shell.stderr.close()
        self.lftp_shell = None
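
    # Usage sketch: the full manual lifecycle. IOHandler wraps this pattern in a
    # context manager, which is safer for most use cases. The credentials shown
    # are hypothetical; by default they are read from the pyremotedata config.
    #
    #   mount = ImplicitMount(user="alice", remote="io.erda.dk")
    #   mount.mount()
    #   print(mount.pwd())
    #   mount.unmount()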

    def pget(self, remote_path: str, local_destination: str, blocking: bool=True, execute: bool=True, output: Union[bool, None]=None, **kwargs):
        """
        Download a single file from the remote directory using the LFTP command `pget`.

        Args:
            remote_path (str): The remote file to download.
            local_destination (str): The local destination to download the file to.
            blocking (bool): If True, the function will block until the download is complete.
            execute (bool): If True, the function will execute the pget command, otherwise it will return the command as a string.
            output (Union[bool, None]): If True, the function will return the absolute local path of the downloaded file, otherwise it will return None. If None, defaults to the value of blocking.
            **kwargs: Keyword arguments to pass to the pget command.

        Returns:
            Union[None, str]: If output is True, the absolute local path of the downloaded file, otherwise None.
        """
        if output is None:
            output = blocking
        default_args = {'n': 5}
        args = {**default_args, **kwargs}
        formatted_args = self.format_options(**args)
        full_command = f'pget {formatted_args} "{remote_path}" -o "{local_destination}"'
        exec_output = self.execute_command(full_command, output=output, blocking=blocking, execute=execute)
        if not execute:
            return exec_output

        # Construct and return the absolute local path
        file_name = os.path.basename(remote_path)
        abs_local_path = os.path.abspath(os.path.join(local_destination, file_name))
        return abs_local_path
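
    # Usage sketch: download one file with 8 connections instead of the default
    # 5 by overriding the `n` option. The remote file name is hypothetical.
    #
    #   local_file = mount.pget("images/0001.jpg", "/tmp/images", n=8)
    #   # => '/tmp/images/0001.jpg'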

    def put(self, local_path: str, remote_destination: Union[str, None]=None, blocking: bool=True, execute: bool=True, output: Union[bool, None]=None, **kwargs):
        """
        Upload a single file to the remote directory using the LFTP command `put`.

        TODO: Is it really just a single file?

        Args:
            local_path (str): The local file to upload.
            remote_destination (str, optional): The remote destination to upload the file to. If None, the file will be uploaded to the current remote directory.
            blocking (bool): If True, the function will block until the upload is complete.
            execute (bool): If True, the function will execute the put command, otherwise it will return the command as a string.
            output (Union[bool, None]): If True, the function will return the absolute remote path of the uploaded file, otherwise it will return None. If None, defaults to the value of blocking.
            **kwargs: Keyword arguments to pass to the put command.
        """
        def source_destination(local_path: Union[str, List[str]], remote_destination: Union[str, List[str], None]=None) -> Tuple[List[str], str]:
            if isinstance(local_path, str):
                local_path = [local_path]
            if not isinstance(local_path, list):
                raise TypeError("Expected list or str, got {}".format(type(local_path)))
            if remote_destination is None:
                # local_path is guaranteed to be a list at this point
                remote_destination = [os.path.basename(p) for p in local_path]
            elif isinstance(remote_destination, str):
                remote_destination = [remote_destination]
            if not isinstance(remote_destination, list):
                raise TypeError("Expected list or str, got {}".format(type(remote_destination)))
            if len(local_path) != len(remote_destination):
                raise ValueError("Expected local_path and remote_destination to have the same length, got {} and {} instead.".format(len(local_path), len(remote_destination)))
            return remote_destination, " ".join([f'"{l}" -o "{r}"' for l, r in zip(local_path, remote_destination)])

        if output is None:
            output = blocking
        # OBS: The online manual for LFTP is invalid for put (at least on ERDA); the documented "P" option for the put command does not exist
        default_args = {}
        args = {**default_args, **kwargs}
        formatted_args = self.format_options(**args)
        remote_destination, src_to_dst = source_destination(local_path, remote_destination)
        full_command = f"put {formatted_args} {src_to_dst}"
        exec_output = self.execute_command(full_command, output=output, blocking=blocking, execute=execute)
        if not execute:
            return exec_output

        # Construct and return the absolute remote path(s)
        rpwd = self.pwd()
        abs_remote_path = [rpwd + "/" + r for r in remote_destination]
        return abs_remote_path
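
    # Usage sketch: upload a local file; with remote_destination=None the file
    # keeps its basename in the current remote directory.
    #
    #   remote_paths = mount.put("results.csv")
    #   # => ['<current remote dir>/results.csv']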

    def ls(self, path: str = ".", recursive: bool=False, use_cache: bool=True, pbar: int=0, top: bool=True) -> List[str]:
        """
        Find all files in the given remote directory using the LFTP command `cls`.
        Can be used recursively, even though LFTP does not support recursive listing with the `cls` command.

        Args:
            path (str): The remote directory to search in.
            recursive (bool): If True, the function will search recursively.
            use_cache (bool): If True, the function will use `cls`, otherwise it will use `recls`. `recls` forces a refresh of the cache.
            pbar (int): If non-zero, prints something to show that the process is not dead; only relevant if recursive is True. Defaults to 0.
            top (bool): DO NOT USE! Flag to indicate whether the recursion is at the top level.

        Returns:
            List[str]: A (possibly empty) list of the file paths found in the given remote directory.
        """
        if path.startswith(".."):
            raise NotImplementedError("ls does not support relative backtracing paths yet.")
        elif path.startswith("./"):
            path = path[2:]
        elif path == ".":
            path = ""

        # This function is used to sanitize the output of the LFTP shell.
        # It is quite inefficient, but it is only used for the ls command, which is not performance critical.
        # Folder index files should be used instead of ls in most cases,
        # but this function is still useful for debugging and for creating the folder index files.
        def sanitize_path(l, path) -> List[str]:
            # Empty case (base case 1)
            if not path:
                pass
            # Single path case (base case 2)
            elif isinstance(path, str):
                # Skip "." and ".." paths and folder index files (these are created by the folder index command, and should be treated as hidden files)
                if len(path) >= 2 and not "folder_index.txt" in path:
                    # Remove leading "./" from paths
                    if path.startswith("."):
                        path = path[2:]
                    # Append the path to the list (this is a mutative operation)
                    l += [path]
            # Multiple paths case (recursive case)
            elif isinstance(path, list):
                for p in path:
                    sanitize_path(l, p)
            else:
                raise TypeError("Expected list, str or None, got {}".format(type(path)))
            return l

        # Recursive ls is implemented using the "cls" command, which returns a list of permissions and paths,
        # and then recursively calling ls on each of the paths that are directories,
        # which is determined by checking whether the permission string starts with "d"
        if recursive:
            recls = "" if use_cache else "re"
            if pbar:
                main_logger.info(f"{CLEAR_LINE}Retrieving file list{'.'*pbar}{ESC_EOL}")
            this_level = self.execute_command(f'{recls}cls "{path}" -1 --perm')
            # If the directory contains one or no files
            if isinstance(this_level, str) or this_level is None:
                this_level = sanitize_path([], this_level)
            output = []
            for perm_path in this_level:
                if not " " in perm_path:
                    continue
                perm, path = perm_path.split(" ", 1)
                if perm.startswith("d"):
                    pbar = (pbar % 10) + 1 if pbar else pbar
                    output += self.ls(path, recursive=True, pbar=pbar, top=False)
                else:
                    sanitize_path(output, path)
        # Non-recursive case
        else:
            output = sanitize_path([], self.execute_command(f'cls "{path}" -1'))

        # Clear the progress bar at the end of the top level
        if pbar and top:
            main_logger.info(f"{CLEAR_LINE}{ESC_EOL}\n")

        # Check that the output is a list
        if not isinstance(output, list):
            raise TypeError("Expected list, got {}".format(type(output)))
        return output
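
    # Usage sketch: list a remote directory recursively, bypassing the LFTP
    # cache. The directory name is hypothetical.
    #
    #   files = mount.ls("images", recursive=True, use_cache=False)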

    def lls(self, local_path: str, **kwargs) -> List[str]:
        """
        Find all files in the given local directory using the LFTP command `!ls` or `!find`.

        OBS: This function should probably not be used; just use the standard OS commands instead.
        """
        recursive = kwargs.get("R", kwargs.get("recursive", False))
        if recursive:
            if local_path == "":
                local_path = "."
            output = self.execute_command(f'!find "{local_path}" -type f -exec realpath --relative-to="{local_path}" {{}} \\;')
        else:
            output = self.execute_command(f'!ls "{local_path}"', **kwargs)
        # Check that the output is a list
        if not isinstance(output, list):
            raise TypeError("Expected list, got {}".format(type(output)))
        return output

    def cd(self, remote_path: str, **kwargs):
        self.execute_command(f'cd "{remote_path}"', output=False, **kwargs)

    def pwd(self) -> str:
        """
        Get the current remote directory using the LFTP command `pwd`.

        Returns:
            str: The current remote directory.
        """
        output = self.execute_command("pwd")
        if isinstance(output, list) and len(output) == 1:
            return output[0]
        else:
            raise TypeError("Expected list of length 1, got {}: {}".format(type(output), output))

    def lcd(self, local_path: str) -> None:
        """
        Change the current local directory using the LFTP command `lcd`.

        Args:
            local_path (str): The local directory to change to.
        """
        self.execute_command(f"lcd {local_path}", output=False)

    def lpwd(self) -> str:
        """
        Get the current local directory using the LFTP command `lpwd`.

        Returns:
            str: The current local directory.
        """
        output = self.execute_command("lpwd")
        if isinstance(output, list) and len(output) == 1:
            return output[0]
        else:
            raise TypeError("Expected list of length 1, got {}: {}".format(type(output), output))

    def _get_current_files(self, dir_path: str) -> List[str]:
        return self.lls(dir_path, R="")

    def mirror(self, remote_path: str, local_destination: str, blocking: bool=True, execute: bool=True, do_return: bool=True, **kwargs) -> Union[None, List[str]]:
        """
        Download a directory from the remote directory to the given local destination using the LFTP command `mirror`.

        Args:
            remote_path (str): The remote directory to download.
            local_destination (str): The local destination to download the directory to.
            blocking (bool): If True, the function will block until the download is complete.
            execute (bool): If True, the function will execute the mirror command, otherwise it will return the command as a string.
            do_return (bool): If True, the function will return a list of the newly downloaded files.
            **kwargs: Keyword arguments to pass to the mirror command.

        Returns:
            Union[None, List[str]]: If do_return is True, a list of the newly downloaded files, otherwise None.
        """
        if do_return:
            # Capture the state of the directory before the operation, as a set of unique paths
            pre_existing_files = set(self._get_current_files(local_destination))

        # Execute the mirror command
        default_args = {'P': 5, 'use-cache': None}
        exec_output = self.execute_command(
            f'mirror "{remote_path}" "{local_destination}"',
            output=blocking,
            blocking=blocking,
            execute=execute,
            default_args=default_args,
            **kwargs
        )
        if not execute:
            return exec_output

        if do_return:
            # Capture the state of the directory after the operation
            post_download_files = set(self._get_current_files(local_destination))
            # The set difference yields the newly downloaded files
            return list(post_download_files - pre_existing_files)
        else:
            return None
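
    # Usage sketch: mirror a remote directory and collect only the files that
    # were actually downloaded (pre-existing local files are excluded via the
    # before/after set difference). The directory names are hypothetical.
    #
    #   new_files = mount.mirror("images", "/tmp/images")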

class IOHandler(ImplicitMount):
    """
    This is a high-level wrapper for the ImplicitMount class, which provides human-friendly methods for downloading files from a remote directory without requiring technical knowledge of LFTP.

    OBS: The attributes of this class should not be used except for development or advanced use cases; in those cases all responsibility lies with the user.

    Args:
        local_dir (str): The local directory to use for downloading files. If None, a temporary directory will be used (suggested, unless truly necessary).
        user_confirmation (bool): If True, the user will be asked for confirmation before deleting files (strongly suggested for debugging and testing).
        clean (bool): If True, the local directory will be cleaned after the context manager is exited (suggested; otherwise disk space may be exhausted rapidly).
        **kwargs: Keyword arguments to pass to the ImplicitMount constructor.

    .. <Sphinx comment
    Methods:
        download(): Download the given remote path to the given local destination.
        multi_download(): Download the given remote paths to the given local destinations.
        clone(): Clone the current remote directory to the given local destination.
        get_file_index(): Get a list of files in the current directory.
        cache_file_index(): Cache the file index for the current directory.
        store_last(): TODO: NOT IMPLEMENTED! Move the last downloaded file or directory to the given destination.
        clean(): Clean the local directory.
        clean_last(): Clean the last downloaded file or directory.
    Sphinx comment>
    """
    def __init__(self, local_dir: Union[str, None]=None, user_confirmation: bool=False, clean: bool=False, **kwargs):
        super().__init__(**kwargs)
        if local_dir is None or local_dir == "":
            if self.default_config['local_dir'] is None or self.default_config['local_dir'] == "":
                local_dir = tempfile.TemporaryDirectory().name
            else:
                local_dir = self.default_config['local_dir']
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)
        self.original_local_dir = os.path.abspath(local_dir)
        self.local_dir = local_dir
        self.user_confirmation = user_confirmation
        self.do_clean = clean
        self.last_download = None
        self.last_type = None
        self.cache = {}

    def lcd(self, local_path: str):
        self.local_dir = os.path.abspath(os.path.join(self.local_dir, local_path))
        super().lcd(self.local_dir)

    def lpwd(self):
        raise TypeError("'lpwd()' should not be used for 'IOHandler' objects, use the 'local_dir' attribute instead.")

    def __enter__(self) -> "IOHandler":
        self.mount()
        self.lcd(self.local_dir)
        # Print the local directory:
        # if the local directory is not specified in the config,
        # it is a temporary directory, so it is nice to know where it is located
        main_logger.debug(f"Local directory: {self.local_dir}")
        return self

    def __exit__(self, *args, **kwargs):
        # The positional and keyword arguments simply catch anything passed to the function, to be ignored
        if self.do_clean:
            self.clean()
        self.unmount()

    # Methods for using the IOHandler without context management

    def start(self) -> None:
        """
        Initialize the connection to the remote directory.

        Very useful for interactive use, but should not be used in scripts; using a context manager is safer and does the same.
        """
        self.__enter__()
        main_logger.warning("IOHandler.start() is unsafe. Use IOHandler.__enter__() instead if possible.")
        main_logger.warning("OBS: Remember to call IOHandler.stop() when you are done.")

    def stop(self) -> None:
        """
        Close the connection to the remote directory.
        """
        self.__exit__()
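
    # Usage sketch for interactive sessions (e.g. a REPL or notebook), where a
    # `with` block is inconvenient:
    #
    #   io = IOHandler()
    #   io.start()
    #   index = io.get_file_index()
    #   io.stop()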

    def download(self, remote_path: Union[str, List[str]], local_destination: Union[str, List[str], None]=None, blocking: bool=True, **kwargs) -> Union[str, List[str]]:
        """
        Download one or more files or a directory from the remote directory to the given local destination.

        Args:
            remote_path (Union[str, List[str]]): The remote path(s) to download.
            local_destination (Union[str, List[str], None]): The local destination to download the file(s) to. If None, the file(s) will be downloaded to the current local directory.
            blocking (bool): If True, the function will block until the download is complete.
            **kwargs: Extra keyword arguments are passed to IOHandler.multi_download, ImplicitMount.pget or ImplicitMount.mirror, depending on the type of the remote path(s).

        Returns:
            Union[str, List[str]]: The local path(s) of the downloaded file(s) or directory.
        """
        # If multiple remote paths are specified, use multi_download instead of download;
        # it is more flexible than mirror (works for files from different directories) and much faster than executing multiple pget commands
        if not isinstance(remote_path, str) and len(remote_path) > 1:
            return self.multi_download(remote_path, local_destination, **kwargs)
        if not isinstance(remote_path, str) and len(remote_path) == 1:
            remote_path = remote_path[0]
        if not isinstance(remote_path, str):
            raise TypeError("Expected str, got {}".format(type(remote_path)))
        if local_destination is None:
            local_destination = self.local_dir

        # Check whether remote and local have file extensions:
        # the function assumes that files have extensions and directories do not.
        remote_has_ext = os.path.splitext(remote_path)[1] != ""
        local_has_ext = os.path.splitext(local_destination)[1] != ""
        # If both remote and local have file extensions, the local destination should be a file path.
        if remote_has_ext and local_has_ext and os.path.isdir(local_destination):
            raise ValueError("Destination must be a file path if both remote and local have file extensions.")
        # If the remote does not have a file extension, the local destination should be a directory.
        if not remote_has_ext and not os.path.isdir(local_destination):
            raise ValueError("Destination must be a directory if remote does not have a file extension.")

        # Download cases:
        # if the remote is a single file, use pget;
        if remote_has_ext:
            local_result = self.pget(remote_path, local_destination, blocking, **kwargs)
            self.last_type = "file"
        # otherwise, use mirror.
        else:
            if not os.path.exists(local_destination):
                try:
                    os.makedirs(local_destination)
                except FileExistsError:
                    pass
            local_result = self.mirror(remote_path, local_destination, blocking, **kwargs)
            self.last_type = "directory"

        # TODO: Check local_result == local_destination (if it can be done in a relatively efficient way)
        # Store the last download for later use (nice for debugging)
        self.last_download = local_result
        # Return the local path of the downloaded file or directory
        return local_result

    def multi_download(self, remote_paths: List[str], local_destination: str, blocking: bool=True, n: int=5, **kwargs) -> List[str]:
        """
        Download a list of files from the remote directory to the given local destination.

        Args:
            remote_paths (List[str]): A list of remote paths to download.
            local_destination (str): The local destination to download the files to. If None, the files will be downloaded to the current local directory.
            blocking (bool): If True, the function will block until the download is complete.
            n (int): The number of connections to use for downloading each file.
            **kwargs: Extra keyword arguments are ignored.

        Returns:
            List[str]: A list of the local paths of the downloaded files.
        """
        # TODO: This function should really wrap an IOHandler.mget function, which should be implemented in the ImplicitMount class
        # Type checking and default argument configuration
        if not isinstance(remote_paths, list):
            raise TypeError("Expected list, got {}".format(type(remote_paths)))
        if not (isinstance(local_destination, str) or local_destination is None):
            raise TypeError("Expected str or None, got {}".format(type(local_destination)))
        if isinstance(local_destination, str):
            os.makedirs(local_destination, exist_ok=True)
        elif local_destination is None:
            local_destination = self.local_dir
        local_files = [os.path.join(local_destination, os.path.basename(r)) for r in remote_paths]
        if any([os.path.splitext(l)[1] != os.path.splitext(r)[1] for l, r in zip(local_files, remote_paths)]):
            raise ValueError("Local and remote file extensions must match.")

        # Assemble the mget command, options and arguments
        multi_command = f'mget -O "{local_destination}" -P {n} ' + ' '.join([f'"{r}"' for r in remote_paths])
        # Execute the mget command
        self.execute_command(multi_command, output=blocking, blocking=blocking)

        # Check that the files were downloaded. TODO: Is this too slow? Should we just assume that the files were downloaded, for efficiency?
        for l in local_files:
            if not os.path.exists(l):
                raise RuntimeError(f"Failed to download {l}")

        # Store the last download for later use (nice for debugging)
        self.last_download = local_destination
        self.last_type = "multi"
        # Return the local paths of the downloaded files
        return local_files
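
    # Usage sketch: download several files from different remote directories in
    # one mget call. The remote paths are hypothetical.
    #
    #   paths = ["a/0001.jpg", "b/0002.jpg"]
    #   local = io.multi_download(paths, "/tmp/batch", n=4)
    #   # => ['/tmp/batch/0001.jpg', '/tmp/batch/0002.jpg']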

    def clone(self, local_destination: Union[None, str], blocking: bool=True, **kwargs) -> Union[None, List[str]]:
        """
        Clone the current remote directory to the given local destination.

        Args:
            local_destination (str): The local destination to clone the current remote directory to. If None, the current local directory is used.
            blocking (bool): If True, the function will block until the download is complete.
            **kwargs: Keyword arguments to pass to the mirror function.

        Returns:
            The output of ImplicitMount.mirror, which depends on the arguments passed to the function; most likely a list of the newly downloaded files.
        """
        if not isinstance(local_destination, str) and local_destination is not None:
            raise TypeError("Expected str or None, got {}".format(type(local_destination)))
        if local_destination is None:
            local_destination = self.local_dir
        local_destination = os.path.abspath(local_destination + os.sep + self.pwd().split("/")[-1])
        if not os.path.exists(local_destination):
            try:
                os.makedirs(local_destination)
            except FileExistsError:
                pass
        return self.mirror(".", local_destination, blocking, **kwargs)

    def get_file_index(self, skip: int=0, nmax: Union[int, None]=None, override: bool=False, store: bool=True, pattern: Union[None, str]=None) -> List[str]:
        """
        Get a list of files in the current remote directory.

        Args:
            skip (int): The number of files to skip.
            nmax (int): The maximum number of files to include.
            override (bool): If True, the file index will be overridden if it already exists.
            store (bool): If True, the file index will be stored on the remote directory.
            pattern (str): A regular expression pattern to filter the file names by, e.g. "\\.txt$" to only include files with the ".txt" extension.

        Returns:
            List[str]: A list of files in the current remote directory.
        """
        if override and not store:
            raise ValueError("override cannot be 'True' if store is 'False'!")
        # Check whether a file index exists on the remote
        glob_result = self.execute_command('glob -f --exist *folder_index.txt && echo "YES" || echo "NO"')
        if isinstance(glob_result, list) and len(glob_result) == 1:
            glob_result = glob_result[0]
        file_index_exists = glob_result == "YES"
        if not file_index_exists and self.verbose:
            main_logger.debug(f"Folder index does not exist in {self.pwd()}")
        # If override is True, delete the existing file index
        if override and file_index_exists:
            self.execute_command("rm folder_index.txt")
            # Now the file index does not exist (duh)
            file_index_exists = False

        # If the file index does not exist, create it
        if not file_index_exists:
            main_logger.debug("Creating folder index...")
            # Traverse the remote directory and write the file index to a local file
            files = self.ls(recursive=True, use_cache=False, pbar=True)
            local_index_path = os.path.join(self.local_dir, "folder_index.txt")
            with open(local_index_path, "w") as f:
                for file in files:
                    f.write(file + "\n")
            # Store the file index on the remote if 'store' is True, and remove the local copy (it is re-downloaded below)
            if store:
                # Self has an implicit reference to the local working directory; however, the script does not necessarily have the same working directory
                self.put("folder_index.txt")
                os.remove(local_index_path)

        # Download the file index if 'store' is True or it already exists on the remote, otherwise read it from the local directory
        if store or file_index_exists:
            file_index_path = self.download("folder_index.txt")
        else:
            file_index_path = local_index_path

        # Read the file index
        file_index = []
        with open(file_index_path, "r") as f:
            for i, line in enumerate(f):
                if i < skip:
                    continue
                if nmax is not None and i >= (skip + nmax):
                    break
                if len(line) < 3 or "folder_index.txt" == line[:16]:
                    continue
                if pattern is not None and re.search(pattern, line) is None:
                    continue
                file_index.append(line.strip())

        # Delete the local copy of the file index if 'store' is False
        if not store:
            os.remove(file_index_path)
        return file_index
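
    # Usage sketch: build (or reuse) the remote folder index and keep only JPEG
    # files, skipping the first 100 entries.
    #
    #   jpegs = io.get_file_index(skip=100, pattern=r"\.jpe?g$")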

    def cache_file_index(self, skip: int=0, nmax: Union[int, None]=None, override: bool=False) -> None:
        self.cache[self.pwd()] = self.get_file_index(skip, nmax, override)

    def clean(self):
        if self.user_confirmation:
            # Ask for confirmation
            confirmation = input(f"Are you sure you want to delete all files in the current directory {self.local_dir}? (y/n)")
            if confirmation.lower() != "y":
                main_logger.debug("Aborted")
                return
        main_logger.debug("Cleaning up...")
        for name in os.listdir(self.local_dir):
            # os.listdir returns bare names, so join with the directory before checking/removing
            path = os.path.join(self.local_dir, name)
            if os.path.isfile(path):
                try:
                    os.remove(path)
                except OSError:
                    main_logger.debug(f"Error while removing {path}")
        try:
            shutil.rmtree(self.local_dir)
        except Exception as e:
            main_logger.error("Error while cleaning local backend directory!")
            files_in_dir = os.listdir(self.local_dir)
            if files_in_dir:
                n_files = len(files_in_dir)
                main_logger.error(f"{n_files} files in local directory ({self.local_dir}):")
                if n_files > 5:
                    main_logger.error("\t" + "\n\t".join(files_in_dir[:5]) + "\n\t...")
                else:
                    main_logger.error("\t" + "\n\t".join(files_in_dir))
            raise e

class RemotePathIterator:
    """
    This class provides a high-level buffered iterator for downloading files from a remote directory.
    All heavy computation is done in a separate thread, to avoid blocking the main thread unnecessarily.

    OBS: The attributes of this class should not be used except for development or advanced use cases; in those cases all responsibility lies with the user.

    Args:
        io_handler (IOHandler): A backend object of class "IOHandler" to use for downloading files.
        batch_size (int): The number of files to download in each batch. Larger batches are more efficient, but may cause memory issues.
        batch_parallel (int): The number of files to download in parallel in each batch. Larger values may be more efficient, but can cause excessive load on the remote server.
        max_queued_batches (int): The batches are processed sequentially from a queue, which is filled on request. This parameter specifies the maximum number of batches in the queue. Larger values can ensure a stable streaming rate, but may require more files to be stored locally.
        n_local_files (int): The number of files to store locally. OBS: This MUST be larger than batch_size * max_queued_batches (twice that is suggested), otherwise files may be deleted before they are consumed.
        clear_local (bool): If True, the local directory will be cleared after the iterator is stopped.
        **kwargs: Keyword arguments to pass to the IOHandler.get_file_index() function. Set 'store' to False to avoid altering the remote directory (this is much slower if you intend to use the iterator multiple times, but it may be necessary if the remote directory is read-only). PSA: If 'store' is False, 'override' must also be False.

    Yields:
        Tuple[str, str]: A tuple containing the local path and the remote path of the downloaded file.

    .. <Sphinx comment
    Methods:
        shuffle(): Shuffle the remote paths.
        subset(): Subset the remote paths.
        split(): Split the remote paths into multiple iterators, which share the same backend. These CANNOT be used in parallel. TODO: Can they be used concurrently?
        download_files(): Download files in batches.
    Sphinx comment>
    """
    def __init__(self, io_handler: "IOHandler", batch_size: int=64, batch_parallel: int=10, max_queued_batches: int=3, n_local_files: int=2*3*64, clear_local: bool=False, **kwargs):
        self.io_handler = io_handler
        if self.io_handler.pwd() not in self.io_handler.cache:
            self.remote_paths = self.io_handler.get_file_index(**kwargs)
        else:
            if kwargs:
                main_logger.warning(f'Using cached file index. [{", ".join(kwargs.keys())}] will be ignored.')
            self.remote_paths = self.io_handler.cache[self.io_handler.pwd()]
        self.temp_dir = self.io_handler.local_dir
        self.batch_size = batch_size
        self.batch_parallel = batch_parallel
        self.max_queued_batches = max_queued_batches
        self.n_local_files = n_local_files
        if self.n_local_files < self.batch_size:
            main_logger.warning(f"n_local_files ({self.n_local_files}) is less than batch_size ({self.batch_size}). This may cause files to be deleted before they are consumed. Consider increasing n_local_files. Recommended value: {2 * self.batch_size * self.max_queued_batches}")
        self.download_queue = Queue()
        self.delete_queue = Queue()
        self.stop_requested = False
        self.not_cleaned = True
        self.clear_local = clear_local

        # State variables
        self.download_thread = None
        self.last_item = None
        self.last_batch_consumed = 0
        self.consumed_files = 0

    def __len__(self) -> int:
        return len(self.remote_paths)

    def shuffle(self) -> None:
        """
        Shuffle the remote paths in-place.

        This function should not be called while iterating.
        """
        if self.download_thread is not None:
            raise RuntimeError("Cannot shuffle while iterating.")
        shuffle(self.remote_paths)

    def subset(self, indices: List[int]) -> None:
        """
        Subset the remote paths in-place.

        This function should not be called while iterating.

        Args:
            indices (List[int]): A list of indices to keep.
        """
        if self.download_thread is not None:
            raise RuntimeError("Cannot subset while iterating.")
        # TODO: It is fine that this works with a list of indices, but it should also work with a single index or a slice.
        self.remote_paths = [self.remote_paths[i] for i in indices]

    def split(self, proportion: Union[List[float], None]=None, indices: Union[List[List[int]], None]=None) -> List["RemotePathIterator"]:
        """
        Split the remote paths into multiple iterators, which share the same backend. These CANNOT be used in parallel.

        Either proportion or indices must be specified, but not both.

        Args:
            proportion (Union[List[float], None]): A list of proportions to split the remote paths into. If None, indices must be specified.
            indices (Union[List[List[int]], None]): A list of lists of indices to split the remote paths into. If None, proportion must be specified.

        Returns:
            List[RemotePathIterator]: A list of RemotePathIterator objects.
        """
        if self.download_thread is not None:
            raise RuntimeError("Cannot split while iterating.")
        if proportion is None and indices is None:
            raise ValueError("Either proportion or indices must be specified.")
        if proportion is not None and indices is not None:
            raise ValueError("Only one of proportion or indices may be specified.")
        if proportion is not None:
            if not isinstance(proportion, list):
                raise TypeError("proportion must be a list.")
            if any([not isinstance(i, float) for i in proportion]):
                raise TypeError("All proportions must be floats.")
            if any([i < 0 or i > 1 for i in proportion]):
                raise ValueError("All proportions must be between 0 and 1.")
            if sum(proportion) != 1:
                # Normalize the proportions so that they sum to 1
                proportion = [p / sum(proportion) for p in proportion]
            # Randomly allocate each remote path to one of the splits, weighted by the proportions
            allocation = choices(list(range(len(proportion))), weights=proportion, k=len(self.remote_paths))
            indices = [[] for _ in range(len(proportion))]
            for i, a in enumerate(allocation):
                indices[a].append(i)
        if indices is not None:
            if not isinstance(indices, list):
                raise TypeError("indices must be a list.")
            if any([not isinstance(i, list) for i in indices]):
                raise TypeError("indices must be a list of lists.")
            if any([any([not isinstance(j, int) for j in i]) for i in indices]):
                raise TypeError("indices must be a list of lists of ints.")
            if any([any([j < 0 or j >= len(self.remote_paths) for j in i]) for i in indices]):
                raise ValueError("indices must be a list of lists of ints in the range [0, len(remote_paths)).")
            if any([len(i) == 0 for i in indices]):
                raise ValueError("All indices must be non-empty.")
        else:
            raise RuntimeError("This should never happen.")

        iterators = []
        for i in indices:
            this = RemotePathIterator(
                self.io_handler,
                batch_size=self.batch_size,
                batch_parallel=self.batch_parallel,
                max_queued_batches=self.max_queued_batches,
                n_local_files=self.n_local_files
            )
            this.subset(i)
            iterators.append(this)
        return iterators
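
    # Usage sketch: an 80/20 train/validation split. `rpi` is a hypothetical
    # RemotePathIterator; the two resulting iterators share the same IOHandler
    # backend and therefore cannot be iterated in parallel.
    #
    #   train_it, val_it = rpi.split(proportion=[0.8, 0.2])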

    def download_files(self):
        """
        Download the entire list of remote paths in batches, and store the local paths in a queue (self.download_queue).

        This function is not intended to be called directly, but there is no good reason why it should not be, and it is useful for debugging and testing.
        """
        queued_batches = 0
        for i in range(0, len(self.remote_paths), self.batch_size):
            if self.stop_requested:
                break
            # Wait until a batch has been consumed (or multiple batches, if the consumer is fast and the producer is slow) before downloading another batch
            while queued_batches >= self.max_queued_batches and not self.stop_requested:
                if self.last_batch_consumed > 0:
                    self.last_batch_consumed -= 1
                    break
                time.sleep(0.2)
            if self.stop_requested:
                break
            batch = self.remote_paths[i:i + self.batch_size]
            try:
                local_paths = self.io_handler.download(batch, n=self.batch_parallel)
            except Exception as e:
                main_logger.error(f"Failed to download batch {i} - {i + self.batch_size}: {e}")
                main_logger.warning("Skipping batch...")
                if self.io_handler.lftp_shell is not None and self.io_handler.lftp_shell.poll() is None:
                    continue
                else:
                    main_logger.error("LFTP shell died. Download thread killed.")
                    break
            else:
                # Only queue the batch if the download succeeded (local_paths is unbound otherwise)
                for local_path, remote_path in zip(local_paths, batch):
                    self.download_queue.put((local_path, remote_path))
                queued_batches += 1

    def start_download_queue(self) -> None:
        """
        Create a new thread and call self.download_files() in it.

        Ensures non-blocking download of the files in self.remote_paths.
        """
        self.download_thread = threading.Thread(target=self.download_files)
        self.download_thread.start()

    def __iter__(self):
        # Force reset state
        self.not_cleaned = True
        self._cleanup()

        # Prepare state for iteration
        self.stop_requested = False
        self.not_cleaned = True

        # Start the download thread
        self.start_download_queue()

        # Main loop body
        try:
            for _ in range(len(self)):
                if self.stop_requested:
                    break
                # Delete files if the queue is too large
                while self.delete_queue.qsize() > self.n_local_files:
                    try:
                        os.remove(self.delete_queue.get(timeout=1))
                    except Exception as e:
                        main_logger.warning(f"Failed to remove file: {e}")
                # Get the next item from the queue, or raise an error if the queue is empty and the download thread is dead
                try:
                    if self.download_queue.empty() and not self.download_thread.is_alive():
                        self.stop_requested = True
                        raise RuntimeError("Download thread died before iteration finished.")
                    # No timeout is applicable, since there are no guarantees on the size of the files or the speed of the connection
                    next_item: Tuple[str, str] = self.download_queue.get()
                    # Update state to ensure that the producer keeps the queue prefilled.
                    # It is a bit complicated, because the logic must be able to handle the case where the consumer is faster than the producer;
                    # in this case the producer may be multiple batches behind the consumer.
                    self.consumed_files += 1
                    if self.consumed_files >= self.batch_size:
                        self.consumed_files -= self.batch_size
                        self.last_batch_consumed += 1
                except queue.Empty:  # TODO: Can this happen? self.download_queue.get() blocks without a timeout.
                    if self.stop_requested:
                        break
                    else:
                        self.stop_requested = True
                        raise RuntimeError("Download queue is empty but no stop was requested. Check the download thread.")
                # Queue the local file for deletion, then yield the next item
                # (local path, remote path => the remote path can be parsed to get the class label)
                self.delete_queue.put(next_item[0])
                yield next_item
        finally:
            self._cleanup()

    def _cleanup(self, force=False) -> None:
        if self.not_cleaned or force:
            # Force the iterator to stop if it is not already stopped
            self.stop_requested = True
            # Wait for the download thread to finish
            if self.download_thread is not None:
                self.download_thread.join(timeout=1)
                self.download_thread = None
            # Clean up the temporary directory
            while not self.download_queue.empty():
                self.delete_queue.put(self.download_queue.get()[0])
            while not self.delete_queue.empty():
                f = self.delete_queue.get()
                try:
                    if os.path.exists(f):
                        os.remove(f)
                except Exception as e:
                    main_logger.warning(f"Failed to remove file ({f}): {e}")
            # Remove any remaining files in the temporary directory
            if self.clear_local and os.path.exists(self.temp_dir):
                for name in os.listdir(self.temp_dir):
                    if "folder_index.txt" in name:
                        continue
                    # os.listdir returns bare names, so join with the directory before checking/removing
                    path = os.path.join(self.temp_dir, name)
                    try:
                        if os.path.isfile(path):
                            os.remove(path)
                    except OSError:
                        main_logger.warning(f"Failed to remove file: {path}")

            ## TODO: DOUBLE CHECK - THIS SHOULD NOT BE NECESSARY (it is at the moment, though!)
            # Check whether the download thread is still running
            if self.download_thread is not None:
                main_logger.error("Download thread is still running. This should not happen.")
                self.stop_requested = True
                self.download_thread.join(timeout=1)
                self.download_thread = None
            # Check whether the download queue is empty
            if not self.download_queue.empty():
                main_logger.error("Download queue is not empty. This should not happen.")
                with self.download_queue.mutex:
                    self.download_queue.queue.clear()
            # Check whether the delete queue is empty
            if not self.delete_queue.empty() and self.clear_local:
                main_logger.error("Delete queue is not empty. This should not happen.")
                with self.delete_queue.mutex:
                    self.delete_queue.queue.clear()
            self.not_cleaned = False
        else:
            main_logger.debug("Already cleaned up")