# Source code for epsman.repo.remoteUpload

"""
epsman

Local python script for job upload to repo.

Can be called from Fabric for remote run case, only requires standard libs + local nbDetails file defining uploads.

Currently duplicates some functions in _repo.py, in stripped-down form.

20/01/20    Testing for Zenodo uploads.  Issue with files >100Mb... drops out with errors, but not messages.

09/01/20    v1

"""

from pathlib import Path
import json
import sys
import pprint
import requests
import os
import glob

#from epsman.repo.pkgFiles import convert_bytes
# Basic bytes to KB/Mb... conversion, from https://stackoverflow.com/questions/2104080/how-to-check-file-size-in-python
def convert_bytes(num):
    """
    Convert a size in bytes to the largest fitting unit, capped at MB.

    Parameters
    ----------
    num : int or float
        Size in bytes.

    Returns
    -------
    list
        ``[value, unit]`` where ``unit`` is one of 'bytes', 'KB' or 'MB'.
        Sizes of 1024 MB or more are still reported in MB (value >= 1024),
        so callers can always index the result.
    """
    units = ('bytes', 'KB', 'MB')  # , 'GB', 'TB')  Keep to MB in this case.

    # Step up through the smaller units; stop as soon as the value fits.
    for unit in units[:-1]:
        if num < 1024.0:
            return [num, unit]
        num /= 1024.0

    # Largest unit: always return, even if the value exceeds 1024, rather
    # than falling off the end of the loop and returning None.
    return [num, units[-1]]


def readNBdetailsJSON(jsonProcFile):
    """Read previously written nbDetails dictionary from JSON file.

    See _repo.readNBdetailsJSON() for local version.

    Parameters
    ----------
    jsonProcFile : str or Path
        Path to the JSON file to read.

    Returns
    -------
    dict or None
        The decoded JSON content, or None if ``jsonProcFile`` is not an
        existing regular file.

    """

    print(f"***Reading local JSON file {jsonProcFile}")

    if not Path(jsonProcFile).is_file():
        print('File not found.')
        return None

    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which may differ on the remote machine.
    with open(jsonProcFile, 'r', encoding='utf-8') as f:
        return json.load(f)


def splitArchFiles(nbDetails, key, dryRun = True, chunk = 90, verbose = True):
    """
    Basic routine to split existing archive files into chunks for upload.

    Split existing archives into size = chunk (MB) files using system zip package.
    See, e.g., https://serverfault.com/questions/760337/how-to-zip-files-with-a-size-limit/760341

    If the archive at ``nbDetails[key]['archName']`` is larger than ``chunk``
    MB, it is split into ``<stem>_multiPart.z*`` parts next to the original,
    and ``nbDetails[key]['repoFiles']`` is updated in place: the original
    archive is removed and the parts are added.  If no parts are found after
    the split attempt, the list is left unchanged.

    Parameters
    ----------
    nbDetails : dict
        Job details; ``nbDetails[key]`` must hold 'archName' and 'repoFiles'.
    key : str
        Item in ``nbDetails`` to process.
    dryRun : bool
        Only controls printing of the updated list; the split itself is
        still performed.
    chunk : int or float
        Maximum part size in MB.
    verbose : bool
        Print the updated file list.

    Returns
    -------
    None

    TODO: replace with better logic...?  Package files for E sets to avoid large archives? Use Tar?

    """
    # Local import so this block does not depend on the module import list.
    import subprocess

    arch = Path(nbDetails[key]['archName'])

    # Work from the raw byte count so archives of any size (incl. >= 1 GB)
    # are compared against the limit correctly.
    archSizeMB = os.stat(arch).st_size / (1024.0 ** 2)
    if archSizeMB <= chunk:
        return

    print(f"***File greater than {chunk}Mb: {arch}")
    print("Splitting into chunks...")

    # Set new file name for chunked arch
    fileOut = arch.with_name(arch.stem + '_multiPart.zip')

    # Run zip on the remote (Linux).  Arguments are passed as a list, not a
    # shell string, so paths with spaces or shell metacharacters are safe.
    # Note -j for junking the path, see http://manpages.ubuntu.com/manpages/precise/man1/zip.1.html
    # TO REBUILD:
    # zip -s 0 testMultipart.zip --out testRecon.zip
    try:
        zipResult = subprocess.run(['zip', '-j', '-s', f"{chunk}m", str(fileOut), str(arch)])
        zipOK = (zipResult.returncode == 0)
    except OSError as err:
        # e.g. zip executable not installed.
        print(f"***Failed to run zip: {err}")
        zipOK = False

    if not zipOK:
        # zip can fail if fileOut already exists from a previous run; in
        # that case any existing parts are still picked up below.
        print(f"***Warning: zip did not complete successfully for {arch}")

    # Get filelist of archive parts (escape the path so any glob
    # metacharacters in it are matched literally; sort for a stable order).
    partsPattern = glob.escape(Path(fileOut.parent, fileOut.stem).as_posix()) + '*'
    fileList = sorted(glob.glob(partsPattern))

    if not fileList:
        # Nothing to upload in place of the archive - keep the original
        # list rather than silently dropping the archive from it.
        print(f"***No multipart files found for {arch}, repoFiles list unchanged.")
        return

    # THEN - update repoFile list with parts
    updatedList = [item for item in nbDetails[key]['repoFiles'] if (item != arch.as_posix())]

    for item in fileList:
        if item not in updatedList:
            updatedList.append(item)

    nbDetails[key]['repoFiles'] = updatedList

    if verbose or dryRun:
        print("Updated repoFiles list with multipart archive.")
        print(*updatedList, sep="\n")


def uploadRepoFiles(nbDetails, key, ACCESS_TOKEN, dryRun = True):
    """Upload files to repo (from local machine)

    See _repo.uploadRepoFiles() for local version.

    Each file in ``nbDetails[key]['repoFiles']`` is POSTed to the Zenodo
    deposition given by ``nbDetails[key]['repoInfo']['id']``.

    Parameters
    ----------
    nbDetails : dict
        Job details; ``nbDetails[key]`` must hold 'repoInfo' (with 'id')
        and 'repoFiles'.
    key : str
        Item in ``nbDetails`` to upload.
    ACCESS_TOKEN : str
        Zenodo API token, sent as a URL query parameter.
    dryRun : bool
        If True, nothing is sent; the request details are printed instead.

    Returns
    -------
    list
        One entry per file.  For real uploads ``[r.ok, body]``, where
        ``body`` is the decoded JSON response, or a
        ``{'status', 'message'}`` dict built from the raw response when the
        server did not return JSON.  For dry runs ``[url, data, files]``;
        the file object inside ``files`` is already closed.

    """

    url = f"https://zenodo.org/api/deposit/depositions/{nbDetails[key]['repoInfo']['id']}/files?access_token={ACCESS_TOKEN}"

    outputs = []
    for fileIn in nbDetails[key]['repoFiles']:
        data = {'name': Path(fileIn).name}

        # Context manager ensures the handle is closed for every file,
        # including on dry runs and when the request raises.
        with open(fileIn, 'rb') as fileHandle:
            files = {'file': fileHandle}

            if dryRun:
                print("Dry run only...")
                print(f"URL: {url}")
                print(f"data: {data}")
                print(f"File: {fileIn}")
                outputs.append([url, data, files])
                continue

            r = requests.post(url, data=data, files=files)

        # Error responses are not guaranteed to be JSON (e.g. a proxy error
        # page on a large upload), so decode defensively and only once.
        try:
            body = r.json()
        except ValueError:
            body = {'status': r.status_code, 'message': r.text}

        if r.ok:
            print(f"File upload OK: {fileIn}")
        elif r.status_code == 400:
            print(f"File already on server: : {fileIn}")
        else:
            print(f"File upload failed: {fileIn}")

        outputs.append([r.ok, body])

    # nbDetails[key]['repoFilesUpload'] = outputs
    # return nbDetails
    return outputs

def writeNBdetailsJSON(jsonProcFile, nbDetails):
    """Write nbDetails dictionary to JSON file.

    See _repo.writeNBdetailsJSON() for local version.

    Parameters
    ----------
    jsonProcFile : str or Path
        Output file; overwritten if it already exists.
    nbDetails : dict
        Data to write.  Must be JSON serializable - note Path() objects
        won't serialize.
    """

    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(jsonProcFile, 'w', encoding='utf-8') as f:
        json.dump(nbDetails, f, indent=2)

    print(f'\n***nbDetails written to local JSON file: {jsonProcFile}')


# If running as main, take passed args and run functions.
# Usage: <script> <nbDetails JSON file> <Zenodo access token>
# TODO: add log file per job writing here?
if __name__ == "__main__":
    # Passed args - fail with a usage message rather than an IndexError.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <nbDetails JSON file> <access token>")

    jsonProcFile = sys.argv[1]
    ACCESS_TOKEN = sys.argv[2]
    verbose = True
    dryRun = False

    print("***Uploading to repo.")

    # Get details from file
    nbDetails = readNBdetailsJSON(jsonProcFile)

    # readNBdetailsJSON returns None when the file is missing; stop here
    # with a clear message instead of failing later on iteration.
    if nbDetails is None:
        sys.exit(f"***No job details read from {jsonProcFile}, nothing uploaded.")

    if verbose:
        pprint.pprint(nbDetails)

    # Upload files & log result.
    # Only items flagged for packaging, with archives checked OK, are
    # uploaded; the 'proc' entry and incomplete entries are skipped.
    for key, item in nbDetails.items():
        uploadItem = (
            key != 'proc'
            and isinstance(item, dict)
            and item.get('pkg')
            and item.get('archFilesOK')
        )

        if uploadItem:
            splitArchFiles(nbDetails, key, dryRun = dryRun, verbose = verbose)
            nbDetails[key]['repoFilesUpload'] = uploadRepoFiles(nbDetails, key, ACCESS_TOKEN, dryRun=dryRun)
        else:
            print(f"***Skipping item {key} upload")

    # Write to new JSON file
    if not dryRun:
        print(f"\nWriting log file {jsonProcFile + '.upload'}")
        writeNBdetailsJSON(jsonProcFile + '.upload', nbDetails)

    print("***Uploads completed.")