"""
epsman
Local python script for job packaging.
Can be called from Fabric for remote run case, only requires standard libs.
May be a better way to do this?
15/01/20 Change file type pattern matching from globPat to rePat - fixes bug with some files being ignored erroneously.
01/01/20 v1
"""
from zipfile import ZipFile
import zipfile
import os
import sys
from pathlib import Path
import glob
import re
import datetime
# Basic bytes to KB/MB/GB... conversion, from https://stackoverflow.com/questions/2104080/how-to-check-file-size-in-python
def convert_bytes(num):
    """
    Convert a size in bytes to the largest suitable binary unit.

    Parameters
    ----------
    num : int or float
        Size in bytes.

    Returns
    -------
    list
        Two-element list [value, unit], with unit one of
        'bytes', 'KB', 'MB', 'GB', 'TB'.
        Values below 1024 bytes are returned unchanged; larger values are
        floats, divided by 1024 per unit step.
        Sizes of 1024 TB or more are reported in TB (value >= 1024) rather
        than falling off the end of the unit list and returning None.
    """
    units = ['bytes', 'KB', 'MB', 'GB', 'TB']

    # Step up through the units until the value fits; the final unit (TB)
    # is handled by the fall-through return so that a result is always given.
    for x in units[:-1]:
        if num < 1024.0:
            return [num, x]
        num /= 1024.0

    return [num, units[-1]]
# Define job root schema for file sorting
# Set here to allow for local function access & keep single set of definitions
def setJobRoot(nbFileName, jobSchema=None):
    """
    Define job dir search pattern from processed notebook filename.

    Parameters
    ----------
    nbFileName : str or Path
        Notebook file defining job. Only the file stem is used.

    jobSchema : str, optional, default = None
        Currently bypassed (25/07/20) and kept only so existing callers
        continue to work. Legacy values were:

        - '2016' Jobs defined as mol/jName_XX-XXeV/
        - '2016sub' Jobs defined by eV, with subdirs, as mol/*_XX-XXeV/jName
        - '2019' Jobs defined as mol/jName/

        For 2019 schema, energies are interleaved, while for 2016 schema
        they are treated independently with different jobs.

    Returns
    -------
    str
        Pattern fragment for matching job files. This is inserted
        unescaped into a regular expression by the CLI code, so any regex
        metacharacters in the file name (e.g. '.') act as regex syntax.

        E.g. for nbFileName = 'aniline_wf_0.1-1.1eV_orb26_B1tot.ipynb',
        jRoot = 'aniline_wf_0.1-1.1.*/.*orb26_B1tot'

    Notes
    -----
    For new style jobs, should add checking for job .conf file, rather than
    notebook name parsing.
    """
    # OLD code, now possibly broken...? Kept for reference.
    # if jobSchema == '2016':
    #     jRoot = nbFileName.stem.rsplit(sep='_', maxsplit=3)
    #     return f"{jRoot[1]}_{jRoot[2]}_{jRoot[3]}"
    # elif jobSchema == '2016sub':
    #     jRoot = nbFileName.stem.rsplit(sep='_', maxsplit=3)
    #     return f"{jRoot[0]}/*{jRoot[2]}_{jRoot[3]}"
    # elif jobSchema == '2016sub-r':   # 17/03/20 added l/r options, kept original for back-compatibility.
    #     jRoot = nbFileName.stem.rsplit(sep='_', maxsplit=2)
    #     return f"{jRoot[0]}.*{jRoot[1]}_{jRoot[2]}"
    # elif jobSchema == '2016sub-l':
    #     jRoot = nbFileName.stem.split(sep='_', maxsplit=2)
    #     return f"{jRoot[0]}.*{jRoot[1]}_{jRoot[2]}"
    # elif jobSchema == '2019':
    #     jRoot = nbFileName.stem.rsplit(sep='_', maxsplit=2)
    #     return f"{jRoot[1]}_{jRoot[2]}"
    # else:
    #     return None  # "Not supported"

    # Updated 25/07/20. May have issues with new (2019) style jobs, but should be OK for all old flavours.
    # Split at eV - this gives the necessary parts for old-style jobs, and
    # leaves the full name for new-style (TBT).
    jobParts = Path(nbFileName).stem.split(sep='eV')

    if len(jobParts) > 1:
        # Text before the first 'eV' and after the last 'eV', joined so the
        # pattern can span a directory separator.
        return f"{jobParts[0]}.*/.*{jobParts[-1].strip('_')}"

    # For no eV case just return full job string - this probably will fail
    # for 2019 jobs, should look for conf file instead, TBT.
    return jobParts[0]
#*** FOLLOWING COMMANDS TO RUN LOCALLY on host
# Build file list as local call - use OS calls.
# fileList = !shopt -s globstar; ls -d -1 '{nbProcDir.as_posix()}/'**/*[!zip] | grep {jRoot}
# fileList = os.system(f"shopt -s globstar; ls -d -1 '{nbProcDir.as_posix()}/'**/*[!zip] | grep {jRoot}")
# Python version
def getFilesPkg(pkgDir, globPat = r"/**/*", rePat = None, recursive=True):
    """
    Glob pkgDir with globPat, and optional re matching with rePat.

    Used for getting file lists for packaging ePS job dirs.

    Parameters
    ----------
    pkgDir : str or Path object
        Directory to search.

    globPat : str, optional, default = r"/**/*"
        Pattern appended to pkgDir for globbing; the default matches
        everything below pkgDir.
        Supports basic pattern matching, e.g. r"/**/*[!zip]", but note glob
        matches single chars there - use rePat for more control.

    rePat : str, optional, default = None
        Regular expression for filtering glob output, applied with
        re.search() to each full path returned by glob.
        E.g. rePat = ".*substring.*" to search for 'substring' in glob output.
        rePat = ".*substring.*$(?<!zip)" to exclude zip files.

    recursive : bool, optional, default = True
        Recursive glob: if True, search subdirs too (with ** pattern).

    Returns
    -------
    list of str
        Paths from glob, filtered by rePat if supplied.
    """
    # Get file list using (optionally recursive) glob and supplied pattern
    fileList = glob.glob(f"{pkgDir}{globPat}", recursive=recursive)

    if rePat is None:
        return fileList

    # Compile once rather than re-resolving the pattern for every path
    matcher = re.compile(rePat)
    return [item for item in fileList if matcher.search(item)]
# THIS IS NOT REQUIRED - just call this script with single file.
# See _repo.updateArch()
#
# def addArchFile(archName, fileIn, cType = zipfile.ZIP_LZMA):
# """Add single file to an archive"""
# # Open archive & write files
# with ZipFile(archName, 'w', compression=cType) as pkgZip:
# # Write file, set also arcname to fix relative paths
# pkgZip.write(fileIn, arcname = Path(fileIn).relative_to(pkgDir))
#
# # Check file is OK
# if pkgZip.testzip() is None:
# # zipList.append(archName)
# print(f'Written {archName} OK')
# return True
# else:
# # failList.append(archName)
# print(f'*** Archive {archName} failed')
# return False
def buildPkg(archName, fileList, pkgDir, archMode = 'w', cType = zipfile.ZIP_LZMA):
    """Build pkg zip from fileList.

    Parameters
    ----------
    archName : str or Path object
        Archive to write.

    fileList : list of strings or Path objects
        Files to include (full paths)

    pkgDir : str or Path object
        Directory to use as root in archive. Files that are not below
        pkgDir are stored in the archive root under their file name only.

    archMode : char, optional, default = 'w'
        Set to 'w'rite or 'a'ppend to existing archive.
        In append mode, files whose archive path already exists in the
        archive are skipped and reported.

    cType : int, default = zipfile.ZIP_LZMA (=14)
        Compression method, passed to ZipFile.

    Returns
    -------
    bool
        True if the archive tested OK and no duplicate files were skipped,
        False otherwise.

    TODO:
    - Check if arch exists for 'w' case?
    - File size checks to add?
    - Summary for files & dirs, and verbosity level.

    """
    # Record of skipped duplicates, as [new file, name in archive]
    dupList = []

    # Create archive & write files
    with ZipFile(archName, archMode, compression=cType) as pkgZip:
        # For append mode, index the names already in the archive by Path so
        # each duplicate check is a single lookup rather than a full scan.
        archNames = {}
        if archMode == 'a':
            archNames = {Path(name): name for name in pkgZip.namelist()}

        for fileIn in fileList:
            # Test & set relative paths for file in archive - may fail in
            # some cases if path root is different.
            try:
                arcFile = Path(fileIn).relative_to(pkgDir)
            except ValueError:
                # In this case just take file name, will go in archive root.
                # Kept as a Path so the duplicate check below compares like
                # with like.
                arcFile = Path(Path(fileIn).name)

            if archMode == 'a':
                dupPath = archNames.get(arcFile)
                if dupPath is not None:
                    dupList.append([fileIn, dupPath])
                    print(f'File: {fileIn} already in archive as {dupPath}.')
                    continue

                # Track the new entry so repeats within fileList are caught too
                archNames[arcFile] = arcFile.as_posix()

            # Write file, set also arcname to fix relative paths
            pkgZip.write(fileIn, arcname = arcFile)

        # Check file is OK (needs the archive to still be open)
        zipOK = pkgZip.testzip() is None

    # Report after the archive is closed, so the size on disk includes the
    # central directory written at close.
    if zipOK and (not dupList):
        print(f'Written {archName} OK')
        fSize = convert_bytes(Path(archName).stat().st_size)
        print(f"{round(fSize[0],2)} {fSize[1]}")

        if fSize[1] in ('GB', 'TB'):
            print("***LARGE FILE")

        return True

    if dupList:
        print('Skipped duplicate files (new, in arch):')
        print(*dupList, sep='\n')
        return False

    print(f'*** Archive {archName} failed')
    return False
# Additional code for checking archives.
def checkArch(archName):
    """Test archive & return info if OK.

    Parameters
    ----------
    archName : str or Path object
        Archive to open (read mode) and test.

    Returns
    -------
    infoList, nameList
        Results of ZipFile.infolist() and ZipFile.namelist() if
        ZipFile.testzip() reports no bad files, otherwise (None, None).
    """
    with ZipFile(archName, 'r') as checkZip:
        # testzip() returns the name of the first bad file, or None if all OK
        if checkZip.testzip() is not None:
            return None, None

        return checkZip.infolist(), checkZip.namelist()
# Code for CLI call from Fabric
# Args: pkgDir, dryRun, archName, jobSchema, jRoot
# If jRoot is not passed, pkg a directory, otherwise pkg single job as defined.
# If jRoot is a file, then add this to archive, otherwise search for files based on jRoot.
# For jRoot case jobSchema is not used, but currently setting method by len(sys.argv), so required.
# TODO: better logic here!
# TODO: consolidate rePat for standard file inclusion
# TODO: white list for pkg file inclusion? rePat is getting silly now.
# NOTE: for no jRoot case, rePat = f".*{jobSchema def}.*$(?<!zip)(?<!ipynb)"
# Otherwise rePat = f".*{jRoot}.*" for basic substring match.
if __name__ == "__main__":
    # CLI entry point.
    # Args: pkgDir, dryRun, archName (or archDir), jobSchema[, jRoot]
    # - With jRoot: package a single job into archName. If jRoot is an
    #   existing file it is appended to the archive; if it is a directory it
    #   is skipped; otherwise it is used as a pattern to find the job files.
    # - Without jRoot: package one archive per notebook found under pkgDir,
    #   written to archDir.

    # The first four args are always read below, so fail early with a clear
    # message rather than an IndexError.
    if len(sys.argv) < 5:
        sys.exit(f"Usage: {sys.argv[0]} pkgDir dryRun archName|archDir jobSchema [jRoot]")

    # Passed args - this is root dir containing notebooks + ePS output subdirs.
    pkgDir = Path(sys.argv[1])
    jobSchema = sys.argv[4]

    # Set for dryRun - this will only display pkgs to be built.
    dryRun = sys.argv[2] == 'True'
    if dryRun:
        print("\n***Archive dry run")

    # Print header lines for job, will be in log file.
    # NOTE(review): printed for dry runs too - confirm this matches the
    # intended layout.
    print("\n***Writing archives")
    print(f"nbProcDir: {pkgDir}")
    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M") + '\n')

    # Set default to write/overwrite archive
    archMode = 'w'

    # Tail of the job file pattern: ignores zip and ipynb files (for later
    # inclusion via single-file calls), files of type .zNN (multipart zip
    # files), and sh / sh~ files.
    # Uses look-behind at end of string for file end exclusion, rather than
    # glob, from https://stackoverflow.com/a/10055688
    pkgExclude = "$(?<!zip)(?<!z[0-9][0-9])(?<!ipynb)(?<!sh)(?<!sh~)"

    # If args are passed, build archive for single job
    if len(sys.argv) > 5:
        jRoot = sys.argv[5]
        archName = Path(sys.argv[3])

        # fileList stays None when there is nothing to package (dir case)
        fileList = None
        rePat = None

        if Path(jRoot).is_file():
            # If a single file is passed, set for this file only.
            fileList = [jRoot]
            archMode = 'a'  # Set to append to existing archive
            print(f'Appending file: {jRoot}')

        elif Path(jRoot).is_dir():
            print(f'Skipping dir: {jRoot}')

        else:
            # If a pattern is passed, create file list for pkg.
            # 25/07/20: added (?!-) to ignore excited state cases with orbXX-XX
            rePat = f".*{jRoot}(?!-).*{pkgExclude}"
            fileList = getFilesPkg(pkgDir, rePat = rePat)

        if fileList is not None:
            # Write zip
            if not dryRun:
                buildPkg(archName, fileList, pkgDir, archMode = archMode)
            else:
                print('\n***Pkg dry run')
                print(f"Arch: {archName}")
                print(f"jRoot: {jRoot}")
                print(f"rePat: {rePat}")
                print(*fileList, sep='\n')

    # Otherwise package full dir based on notebooks in root.
    else:
        archDir = Path(sys.argv[3])

        # Create notebook file list for pkg (dot escaped so only a real
        # .ipynb extension matches)
        nbFileList = getFilesPkg(pkgDir, globPat = r"/**/*", rePat = r"\.ipynb$")

        if dryRun:
            print("\n***Notebook file list:")
            print(*nbFileList, sep='\n')

        zipList = []
        failList = []

        for item in nbFileList:  # Local file list
            # Job keys
            item = Path(item)
            jRoot = setJobRoot(item, jobSchema)
            archName = Path(archDir, item.stem + '.zip')

            # Create file list for pkg
            # 25/07/20: added (?!-) to ignore excited state cases with orbXX-XX
            rePat = f".*{jRoot}(?!-).*{pkgExclude}"
            fileList = getFilesPkg(pkgDir, rePat = rePat)

            # Write zip
            if not dryRun:
                if buildPkg(archName, fileList, pkgDir, archMode = archMode):
                    zipList.append(archName)
                else:
                    failList.append(archName)
            else:
                print('\n***Pkg dry run')
                print(f"Job: {item}")
                print(f"Arch: {archName}")
                print(f"jRoot: {jRoot}")
                print(f"rePat: {rePat}")
                print(*fileList, sep='\n')

        # NOTE(review): placed after the notebook loop; confirm whether this
        # summary line should also be printed for the single-job case.
        if not dryRun:
            print(f'\nArchives completed at {datetime.datetime.now().strftime("%Y-%m-%d %H:%M")}\n')

        # TODO: Additional print/pass/write results here?
        # Currently included print() statements from buildPkg()