# Source code for epsman.repo.pkgFiles

"""
epsman

Local python script for job packaging.

Can be called from Fabric for remote run case, only requires standard libs.

May be a better way to do this?

15/01/20    Change file type pattern matching from globPat to rePat - fixes bug with some files being ignored erroneously.

01/01/20    v1

"""

from zipfile import ZipFile
import zipfile
import os
import sys
from pathlib import Path
import glob
import re
import datetime


# Basic bytes to KB/Mb... conversion, from https://stackoverflow.com/questions/2104080/how-to-check-file-size-in-python
def convert_bytes(num):
    """
    Convert a size in bytes to the largest suitable unit (KB, MB, GB, TB).

    Parameters
    ----------
    num : int or float
        Size in bytes.

    Returns
    -------
    list
        [value, unit], where unit is one of 'bytes', 'KB', 'MB', 'GB', 'TB'.
        Sizes of 1024 TB or more are reported in TB (value >= 1024) instead of
        running past the end of the unit list and returning None.

    """
    units = ['bytes', 'KB', 'MB', 'GB', 'TB']

    # Step up through the units; the last unit is handled after the loop so
    # that very large sizes still yield a [value, unit] pair.
    for unit in units[:-1]:
        if num < 1024.0:
            return [num, unit]
        num /= 1024.0

    return [num, units[-1]]


# Define job root schema for file sorting
# Set here to allow for local function access & keep single set of definitions
def setJobRoot(nbFileName, jobSchema):
    """
    Build the job-root search pattern from a processed notebook file name.

    Parameters
    ----------
    nbFileName : str or Path
        Notebook file defining the job. Only the stem (name without final suffix) is used.

    jobSchema : str
        Currently ignored (bypassed since 25/07/20), kept in the signature for existing callers.
        Historical values were:
        - '2016' Jobs defined as mol/jName_XX-XXeV/
        - '2016sub' Jobs defined by eV, with subdirs, as mol/*_XX-XXeV/jName
        - '2016sub-r', '2016sub-l' right/left split variants of '2016sub' (added 17/03/20)
        - '2019' Jobs defined as mol/jName/
        For the 2019 schema energies are interleaved, while for the 2016 schema they are treated
        independently with different jobs.

    Returns
    -------
    str
        If the stem contains 'eV': "<text before first eV>.*/.*<text after last eV, '_' stripped>".
        E.g. 'aniline_wf_0.1-1.1eV_orb26_B1tot.ipynb' gives 'aniline_wf_0.1-1.1.*/.*orb26_B1tot'.
        Otherwise the full stem is returned unchanged.

    Notes
    -----
    The per-schema, underscore-splitting logic used before 25/07/20 has been replaced by the single
    split-at-'eV' rule below. This should be OK for all old-style jobs; the no-'eV' case will probably
    fail for new-style (2019) jobs, which should check for a job .conf file instead (TBT).

    """
    stem = Path(nbFileName).stem
    marker = 'eV'

    # No energy marker: hand back the whole job string.
    if marker not in stem:
        return stem

    # Text before the first marker and after the last one delimit the job.
    prefix = stem.partition(marker)[0]
    suffix = stem.rpartition(marker)[2].strip('_')

    return prefix + ".*/.*" + suffix


#*** FOLLOWING COMMANDS TO RUN LOCALLY on host


# Build file list as local call - use OS calls.
# fileList = !shopt -s globstar; ls -d -1 '{nbProcDir.as_posix()}/'**/*[!zip]  | grep {jRoot}
# fileList = os.system(f"shopt -s globstar; ls -d -1 '{nbProcDir.as_posix()}/'**/*[!zip]  | grep {jRoot}")

# Python version

def getFilesPkg(pkgDir, globPat = r"/**/*", rePat = None, recursive=True):
    """
    List paths under pkgDir via glob, optionally filtered by a regular expression.

    Used for getting file lists for packaging ePS job dirs.

    Parameters
    ----------
    pkgDir : str or Path object
        Directory to search. The glob pattern is appended directly to this.

    globPat : str, optional, default = r"/**/*"
        Glob pattern appended to pkgDir; the default matches everything below pkgDir.
        Supports basic pattern matching, e.g. r"/**/*[!zip]", but note that glob
        matches single chars there - use rePat for more control.

    rePat : str, optional, default = None
        Regular expression applied to each glob result with re.search; only matching paths are kept.
        E.g. rePat = ".*substring.*" to keep paths containing 'substring',
        rePat = ".*substring.*$(?<!zip)" to also exclude paths ending in 'zip'.

    recursive : bool, optional, default = True
        Passed to glob: if True, the ** pattern also searches subdirs.

    Returns
    -------
    list of str
        Matching paths, as returned by glob (files and directories).

    """
    candidates = glob.glob(f"{pkgDir}{globPat}", recursive=recursive)

    # No filter supplied: the raw glob output is the answer.
    if rePat is None:
        return candidates

    return [path for path in candidates if re.search(rePat, path)]

# THIS IS NOT REQUIRED - just call this script with single file.
# See _repo.updateArch()
#
# def addArchFile(archName, fileIn, cType = zipfile.ZIP_LZMA):
#     """Add single file to an archive"""
#     # Open archive & write files
#     with ZipFile(archName, 'w', compression=cType) as pkgZip:
#         # Write file, set also arcname to fix relative paths
#         pkgZip.write(fileIn, arcname = Path(fileIn).relative_to(pkgDir))
#
#         # Check file is OK
#         if pkgZip.testzip() is None:
#             # zipList.append(archName)
#             print(f'Written {archName} OK')
#             return True
#         else:
#             # failList.append(archName)
#             print(f'*** Archive {archName} failed')
#             return False


def buildPkg(archName, fileList, pkgDir, archMode = 'w', cType = zipfile.ZIP_LZMA):
    """Build pkg zip from fileList

    Parameters
    ----------
    archName : str or Path object
        Archive to write.

    fileList : list of strings or Path objects
        Files to include (full paths)

    pkgDir : str or Path object
        Directory to use as root in archive. Files that are not below pkgDir
        are stored in the archive root under their bare file name.

    archMode : char, optional, default = 'w'
        Set to 'w'rite or 'a'ppend to existing archive.
        In append mode, files whose archive path is already present are skipped.

    cType : int, default = zipfile.ZIP_LZMA (=14)
        Compression method (a zipfile.ZIP_* constant).

    Returns
    -------
    bool
        True if all files were written and the archive passes testzip().
        False if any file was skipped as a duplicate (append mode), or if the
        archive test failed.

    TODO:
    - Check if arch exists for 'w' case?
    - File size checks to add?
    - Summary for files & dirs, and verbosity level.

    """
    # Pairs of [new file, existing archive member] skipped in append mode
    dupList = []

    # Create archive & write files
    with ZipFile(archName, archMode, compression=cType) as pkgZip:
        for fileIn in fileList:
            # Set relative path for file in archive; fall back to the bare file name
            # (archive root) if fileIn is not below pkgDir.
            # Kept as a Path in both cases so the duplicate comparison below is Path vs Path.
            try:
                arcFile = Path(fileIn).relative_to(pkgDir)
            except ValueError:
                arcFile = Path(Path(fileIn).name)

            # When appending, skip files already present under the same archive path.
            if archMode == 'a':
                dupPath = next((name for name in pkgZip.namelist() if Path(name) == arcFile), None)
                if dupPath is not None:
                    dupList.append([fileIn, dupPath])
                    print(f'File: {fileIn} already in archive as {dupPath}.')
                    continue

            # Write file, set also arcname to fix relative paths
            pkgZip.write(fileIn, arcname = arcFile)

        # Integrity check: name of first bad member, or None if all OK
        badFile = pkgZip.testzip()

    # Archive is closed from here on, so the size on disk below is final.
    if dupList:
        print('Skipped duplicate files (new, in arch):')
        print(*dupList, sep='\n')
        return False

    if badFile is not None:
        print(f'*** Archive {archName} failed')
        return False

    print(f'Written {archName} OK')
    fSize = convert_bytes(Path(archName).stat().st_size)
    print(f"{round(fSize[0],2)} {fSize[1]}")
    if fSize[1] in ('GB', 'TB'):
        print("***LARGE FILE")
    return True


# Additional code for checking archives.
def checkArch(archName):
    """
    Test an archive and return its contents listing if it is OK.

    Parameters
    ----------
    archName : str or Path object
        Zip archive to open (read-only) and test.

    Returns
    -------
    infoList, nameList
        ZipFile.infolist() and ZipFile.namelist() for the archive if testzip()
        reports no bad members, otherwise (None, None).

    """
    with ZipFile(archName, 'r') as checkZip:
        # testzip() gives the name of the first corrupt member, or None if all are fine.
        if checkZip.testzip() is not None:
            return None, None

        return checkZip.infolist(), checkZip.namelist()


# Code for CLI call from Fabric
# Args: pkgDir, dryRun, archName, jobSchema, jRoot
# If jRoot is not passed, pkg a directory, otherwise pkg single job as defined.
# If jRoot is a file, then add this to archive, otherwise search for files based on jRoot.
# For jRoot case jobSchema is not used, but currently setting method by len(sys.argv), so required.
# TODO: better logic here!
# TODO: consolidate rePat for standard file inclusion
# TODO: white list for pkg file inclusion?  rePat is getting silly now.
# NOTE: for no jRoot case, rePat = f".*{jobSchema def}.*$(?<!zip)(?<!ipynb)"
#        Otherwise rePat = f".*{jRoot}.*" for basic substring match.
if __name__ == "__main__":

    # Positional args: pkgDir, dryRun, archName (or archive dir), jobSchema, [jRoot]
    # Fail with a usage message rather than a bare IndexError if any required arg is missing.
    if len(sys.argv) < 5:
        sys.exit(f"Usage: {sys.argv[0]} pkgDir dryRun archName jobSchema [jRoot]")

    # Passed args - this is root dir containing notebooks + ePS output subdirs.
    pkgDir = Path(sys.argv[1])
    jobSchema = sys.argv[4]

    # Set for dryRun - this will only display pkgs to be built.
    dryRun = (sys.argv[2] == 'True')
    if dryRun:
        print("\n***Archive dry run")
    else:
        # Print header lines for job, will be in log file.
        print("\n***Writing archives")
        print(f"nbProcDir: {pkgDir}")
        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M") + '\n')

    # Set default to write/overwrite archive
    archMode = 'w'

    # If jRoot is passed, build archive for single job (or append a single file)
    if len(sys.argv) > 5:
        jRoot = sys.argv[5]
        archName = Path(sys.argv[3])

        # fileList stays None if there is nothing to package (directory case),
        # so the steps below can be skipped without touching any existing archive.
        fileList = None
        rePat = None

        if Path(jRoot).is_file():
            # If a single file is passed, set for this file only.
            fileList = [jRoot]
            archMode = 'a'  # Set to append to existing archive
            print(f'Appending file: {jRoot}')
        elif Path(jRoot).is_dir():
            print(f'Skipping dir: {jRoot}')
        else:
            # If a pattern is passed, create file list for pkg
            # NOTE: currently set to ignore zip and ipynb files for later inclusion via single-file calls.
            # Also skips files of type .zNN, which are multipart zip files.
            # 25/07/20: added (?!-) to ignore excited state cases with orbXX-XX
            rePat = f".*{jRoot}(?!-).*$(?<!zip)(?<!z[0-9][0-9])(?<!ipynb)(?<!sh)(?<!sh~)"
            fileList = getFilesPkg(pkgDir, rePat = rePat)

        if fileList is not None:
            # Write zip
            if not dryRun:
                buildPkg(archName, fileList, pkgDir, archMode = archMode)
            else:
                print('\n***Pkg dry run')
                print(f"Arch: {archName}")
                print(f"jRoot: {jRoot}")
                print(f"rePat: {rePat}")
                print(*fileList, sep='\n')

    # Otherwise package full dir based on notebooks in root.
    else:
        archDir = Path(sys.argv[3])

        # Create notebook file list for pkg (dot escaped so only a real .ipynb suffix matches)
        rePat = r"\.ipynb$"
        nbFileList = getFilesPkg(pkgDir, globPat = r"/**/*", rePat = rePat)

        if dryRun:
            print("\n***Notebook file list:")
            print(*nbFileList, sep='\n')

        # Archives written OK / archives for which buildPkg() returned False
        zipList = []
        failList = []
        for item in nbFileList:  # Local file list

            # Job keys
            item = Path(item)
            jRoot = setJobRoot(item, jobSchema)
            archName = Path(archDir, item.stem + '.zip')

            # Create file list for pkg
            # File-end exclusion is done with lookbehinds after $, rather than glob, from https://stackoverflow.com/a/10055688
            # Also skips files of type .zNN, which are multipart zip files.
            # 25/07/20: added (?!-) to ignore excited state cases with orbXX-XX
            rePat = f".*{jRoot}(?!-).*$(?<!zip)(?<!z[0-9][0-9])(?<!ipynb)(?<!sh)(?<!sh~)"
            fileList = getFilesPkg(pkgDir, rePat = rePat)

            # Write zip
            if not dryRun:
                if buildPkg(archName, fileList, pkgDir, archMode = archMode):
                    zipList.append(archName)
                else:
                    failList.append(archName)

            else:
                print('\n***Pkg dry run')
                print(f"Job: {item}")
                print(f"Arch: {archName}")
                print(f"jRoot: {jRoot}")
                print(f"rePat: {rePat}")
                print(*fileList, sep='\n')

        if not dryRun:
            print(f'\nArchives completed at {datetime.datetime.now().strftime("%Y-%m-%d %H:%M")}\n')
            # TODO: Additional print/pass/write results here?
            # Currently included print() statements from buildPkg()