2 added + 7 modified, total 9 files
java/sandbox/data-cat/src/main/python/hpsdatacat
--- java/sandbox/data-cat/src/main/python/hpsdatacat/__init__.py 2014-06-04 02:03:54 UTC (rev 662)
+++ java/sandbox/data-cat/src/main/python/hpsdatacat/__init__.py 2014-06-04 02:04:40 UTC (rev 663)
@@ -0,0 +1,3 @@
+"""
+@author: Jeremy McCormick <[log in to unmask]>
+"""
\ No newline at end of file
java/sandbox/data-cat/src/main/python/hpsdatacat
--- java/sandbox/data-cat/src/main/python/hpsdatacat/add_location.py 2014-06-04 02:03:54 UTC (rev 662)
+++ java/sandbox/data-cat/src/main/python/hpsdatacat/add_location.py 2014-06-04 02:04:40 UTC (rev 663)
@@ -1,27 +1,15 @@
+#!/usr/bin/env python
+
"""
-Wrapper for 'addLocation' command.
+Add an additional location for a dataset using the 'addLocation' command.
-The help from that command is the following:
-
-Usage: datacat addLocation [-options] <dataset name> <logical folder> <file path>
-
-parameters:
- <dataset name> Tag-name for the new dataset.
- <logical folder> Logical Folder Path under which the dataset lives.
- <file path> Location of file to add to Data Catalog.
-
-options:
- --group <Dataset Group> Dataset Group under which the dataset lives.
- --site <Site=SLAC> Site at which file exists on disk. Defaults to SLAC if not specified.
- --version <Version=-1> Version ID of the dataset (Defaults to the Latest version if not specified.)
-
Example command:
python ./src/main/python/hpsdatacat/add_location.py -n hps_testrun_001351_recon -f \
/HPS/testrun2012/data/recon -p /nfs/slac/g/hps3/data/datacat-test/data/hps_testrun_001351_test2.slcio \
-s JLAB
-FIXME: Above command doesn't work right now!
+FIXME: Above command doesn't work right now because JLAB isn't a valid site!
"""
java/sandbox/data-cat/src/main/python/hpsdatacat
--- java/sandbox/data-cat/src/main/python/hpsdatacat/add_metadata.py 2014-06-04 02:03:54 UTC (rev 662)
+++ java/sandbox/data-cat/src/main/python/hpsdatacat/add_metadata.py 2014-06-04 02:04:40 UTC (rev 663)
@@ -1,7 +1,7 @@
+#!/usr/bin/env python
+
"""
-Script for setting meta data on an existing dataset or group.
-This script cannot be currently used to set a new value on an
-existing meta data field.
+Set meta data on an existing dataset or group using the 'addMetaData' command.
"""
import argparse
@@ -13,10 +13,10 @@
# create the argparser
parser = create_base_argparser(__command)
-parser.add_argument('-f', '--folder', help='folder where dataset lives')
-parser.add_argument('-d', '--dataset', help='target dataset for meta data')
-parser.add_argument('-v', '--version', help='version ID of the dataset (defaults to latest)')
-parser.add_argument('-g', '--group', help='dataset group or group to tag when no dataset specified')
+parser.add_argument('-f', '--folder', help='folder where local_file lives')
+parser.add_argument('-d', '--local_file', help='target local_file for meta data')
+parser.add_argument('-v', '--version', help='version ID of the local_file (defaults to latest)')
+parser.add_argument('-g', '--group', help='local_file group or group to tag when no local_file specified')
parser.add_argument('-m', '--metadata', nargs='*', help='a single meta data definition')
args = vars(parser.parse_args())
@@ -29,11 +29,11 @@
if connection_string == None:
raise Exception("Couldn't figure out a connection string to use!")
-# dataset
-if args['dataset'] != None:
- dataset = args['dataset']
+# local_file
+if args['local_file'] != None:
+ local_file = args['local_file']
else:
- dataset = None
+ local_file = None
# group
if args['group'] != None:
@@ -41,9 +41,9 @@
else:
group = None
-# dataset and/or group is required
-if dataset == None and group == None:
- raise Exception("A dataset or group is required.")
+# local_file and/or group is required
+if local_file == None and group == None:
+ raise Exception("A local_file or group is required.")
# folder
if args['folder'] != None:
@@ -67,9 +67,9 @@
# create base command line
command_line = create_base_command_line(__command, connection_string, dry_run, mode)
-# append dataset
-if dataset != None:
- command_line += ' --dataset %s' % dataset
+# append local_file
+if local_file != None:
+ command_line += ' --local_file %s' % local_file
# append version
if version != None:
java/sandbox/data-cat/src/main/python/hpsdatacat
--- java/sandbox/data-cat/src/main/python/hpsdatacat/crawler.py (rev 0)
+++ java/sandbox/data-cat/src/main/python/hpsdatacat/crawler.py 2014-06-04 02:04:40 UTC (rev 663)
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+
+"""
+Directory crawler that will find and add new files to the data catalog.
+Files with a creation time after the modification time of a timestamp
+file are registered. It uses the register.py script to perform the
+dataset registration.
+"""
+
+import os
+from util import *
+from extract_metadata import *
+
+# file extensions that will be looked at by default
+__default_extensions = ('slcio', 'evio', 'root')
+
+# name of command, but does not correspond to anything in the data catalog API
+__command = 'crawler'
+
+# default site
+site = get_default_site()
+
+# default group
+group = get_default_group()
+
+# default data catalog root path for file registration
+default_base_folder = '/HPS'
+
+# path to script for registering new files
+register_script = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'register.py')
+
+# command line options
+parser = create_base_argparser(__command)
+parser.add_argument('-d', '--directory', help='starting base_directory to crawl')
+parser.add_argument('-t', '--timestamp', help='files newer than the timestamp file will be registered')
+parser.add_argument('-e', '--extension', help='only handle files with given extension')
+parser.add_argument('-g', '--group', help='dataset group')
+parser.add_argument('-s', '--site', help='dataset site e.g. SLAC or JLAB')
+parser.add_argument('-p', '--path', help='path in the data catalog')
+args = vars(parser.parse_args())
+
+connection_string, dry_run, mode = handle_standard_arguments(args)
+
+# Try to figure out a default connection string if none was supplied.
+if connection_string == None:
+ connection_string = get_ssh_connection_string()
+ if connection_string == None:
+ raise Exception("Couldn't figure out a connection_string to use!")
+
+# base directory to crawl
+if args['directory'] != None:
+ base_directory = args['directory']
+else:
+ raise Exception("The directory is required!")
+
+# timestamp file to use
+if args['timestamp'] != None:
+ timestamp_file_path = args['timestamp']
+ if not os.path.isfile(timestamp_file_path):
+ raise Exception("The timestamp file %s does not exist!" % timestamp_file_path)
+else:
+ raise Exception("The timestamp file is a required argument!")
+
+# the timestamp for comparison is the modification time of the timestamp file
+timestamp = os.path.getmtime(timestamp_file_path)
+print 'using timestamp ', str(timestamp)
+
+# dataset group
+if args['group'] != None:
+ group = args['group']
+
+# dataset site
+if args['site'] != None:
+ site = args['site']
+ check_valid_site(site)
+
+# path in data catalog
+datacat_path = None
+if args['path'] != None:
+ datacat_path = args['path']
+
+# file extension to process
+handle_extensions = __default_extensions
+if args['extension'] != None:
+ # only look at files with extension matching argument
+ handle_extensions = (args['extension'])
+
+# walk the directory tree
+for dirname, dirnames, filenames in os.walk(base_directory):
+ # ignore directories starting with a '.'
+ if dirname[0] == '.':
+ continue
+ # process files
+ for filename in filenames:
+
+ # get file path and extension
+ full_path = os.path.join(dirname, filename)
+ extension = os.path.splitext(full_path)[1][1:]
+
+ # process file if it is a valid extension
+ if extension in handle_extensions:
+
+ # get the creation time of the file
+ file_ctime = os.path.getctime(full_path)
+
+ # register files with creation time greater than modification time of the timestamp file
+ if file_ctime > timestamp:
+
+ print 'found file %s with creation time %f' % (full_path, file_ctime)
+
+ # extract meta data
+ metadata_extractor = get_metadata_extractor(full_path)
+ if (metadata_extractor == None):
+ raise Exception("A MetaDataExtractor for %s was not found!" % full_path)
+ metadata_extractor.extract_metadata(full_path)
+ metadata = metadata_extractor.to_define_string()
+
+ # relative path
+ rel_path = full_path.replace(base_directory, '')
+ base_path = os.path.dirname(rel_path)
+
+ # figure out the folder to use in the data catalog
+ if datacat_path == None:
+ # folder from structure under root directory
+ datacat_folder = default_base_folder
+ datacat_folder += os.path.dirname(rel_path)
+ else:
+ # folder from command line argument
+ datacat_folder = datacat_path
+
+ # build the command line
+ command_line = register_script
+ if dry_run == True:
+ command_line += ' --dry-run'
+ if mode != None:
+ command_line += ' --mode ' % mode
+ command_line += ' -c %s' % connection_string
+ command_line += ' --file %s' % full_path
+ command_line += ' --path %s' % datacat_folder
+ command_line += ' --group %s' % group
+ command_line += ' --site %s' % site
+ command_line += '%s' % metadata
+
+ # run the register command and print results
+ print "Registering new file with command ..."
+ print command_line
+ lines, errors, return_value = run_process(command_line, False)
+ if len(errors) > 0 or return_value != 0:
+ print "Command returned with an error!"
+ # just print the first error
+ print errors[0]
+ else:
+ print "File successfully registered!"
+
+# touch the timestamp file to update its modification time
+os.utime(timestamp_file_path, None)
\ No newline at end of file
java/sandbox/data-cat/src/main/python/hpsdatacat
--- java/sandbox/data-cat/src/main/python/hpsdatacat/delete.py 2014-06-04 02:03:54 UTC (rev 662)
+++ java/sandbox/data-cat/src/main/python/hpsdatacat/delete.py 2014-06-04 02:04:40 UTC (rev 663)
@@ -1,5 +1,7 @@
+#!/usr/bin/env python
+
"""
-Delete a file from the data catalog.
+Delete a file from the data catalog by using the 'rm' command.
"""
import argparse
java/sandbox/data-cat/src/main/python/hpsdatacat
--- java/sandbox/data-cat/src/main/python/hpsdatacat/extract_metadata.py (rev 0)
+++ java/sandbox/data-cat/src/main/python/hpsdatacat/extract_metadata.py 2014-06-04 02:04:40 UTC (rev 663)
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+
+"""
+Utility classes for automatically extracting meta data from LCIO, EVIO and ROOT files.
+
+In order for these classes to work, some setup needs to be done first ...
+
+LCIO must have been compiled and installed with ROOT dictionary support enabled.
+
+The following environment must be setup for the LCIO and ROOT Python bindings to work:
+
+ export PYTHONPATH=$LCIO/src/python/:$ROOTSYS/lib
+ export LCIO=$LCIO/build
+
+There is no Python binding for EVIO, so an external Java class is used from the hps-data-cat module
+to extract the meta data.
+
+The jar containing this class must be added to the classpath:
+
+ export CLASSPATH=/path/to/hps/java/sandbox/data-cat/target/hps-datacat-0.1-SNAPSHOT-bin.jar
+
+The correct 'java' binary must also be present in the shell environment of the Python interpreter.
+"""
+
+import sys, os, ast
+from util import *
+
+suppress_print()
+from pyLCIO import IOIMPL
+restore_print()
+
+import ROOT
+
+# Java class used to extract information from EVIO files
+evio_extractor_java_class = "org.hps.datacat.EvioMetaDataPythonTool"
+
+"""
+Base class for extracting meta data from a file.
+"""
+class MetaDataExtractor:
+
+ def extract_metadata(self, file_path):
+ pass
+
+ def handles_extension(self):
+ pass
+
+ def get_metadata(self):
+ return self.metadata
+
+ def to_define_string(self):
+ #print self.metadata
+ define_string = ''
+ for key, value in self.metadata.items():
+ define_string += ' --define %s=\"%s\"' % (key, value)
+ return define_string
+
+"""
+Extract meta data from an LCIO file.
+"""
+# TODO: add min and max event numbers by reading through whole file
+class LcioMetaDataExtractor(MetaDataExtractor):
+
+ def extract_metadata(self, file_path):
+ self.metadata = {}
+ reader = IOIMPL.LCFactory.getInstance().createLCReader()
+ reader.open(file_path)
+ runNumber = None
+ detectorName = None
+ collectionNames = []
+ for event in reader:
+ runNumber = event.getRunNumber()
+ detectorName = event.getDetectorName()
+ for collectionName, collection in event:
+ #print '\t%s of type %s with %d elements' % ( collectionName, collection.getTypeName(), collection.getNumberOfElements() )
+ collectionNames.append(collectionName)
+ break
+ reader.close()
+ self.metadata['nRun'] = runNumber
+ self.metadata['sDetectorName'] = detectorName
+ self.metadata['sCollectionNames'] = ",".join(collectionNames)
+
+ def handles_extension(self):
+ return 'slcio'
+
+"""
+Extract meta data from a ROOT DST file.
+"""
+class RootDstMetaDataExtractor(MetaDataExtractor):
+
+ def extract_metadata(self, file_path):
+ self.metadata = {}
+ suppress_print()
+ file = ROOT.TFile(file_path)
+ tree = file.Get("HPS_Event")
+ tree.GetEntry(0)
+ run_number = tree.GetLeaf("run_number").GetValue(0)
+ restore_print()
+ self.metadata['nRun'] = int(run_number)
+
+ def handles_extension(self):
+ return 'root'
+
+"""
+Extract meta data from an EVIO file.
+"""
+class EvioMetaDataExtractor(MetaDataExtractor):
+
+ def extract_metadata(self, file_path):
+ command_line = 'java %s %s' % (evio_extractor_java_class, file_path)
+ lines, errors, return_value = run_process(command_line)
+ if len(errors) != 0 or return_value != 0:
+ raise Exception("Call to %s failed!" % evio_extractor_java_class)
+ line = lines[0]
+ self.metadata = ast.literal_eval(line)
+
+ def handles_extension(self):
+ return 'evio'
+
+# list of meta data extractors
+metadata_extractors = (LcioMetaDataExtractor(), RootDstMetaDataExtractor(), EvioMetaDataExtractor())
+
+# get a meta data extractor for a file
+def get_metadata_extractor(file_path):
+ file_extension = os.path.splitext(file_path)[1][1:]
+ for extractor in metadata_extractors:
+ if extractor.handles_extension() == file_extension:
+ return extractor
+ return None
+
+if __name__ == '__main__':
+
+ if len(sys.argv) < 2:
+ raise Exception("File is required argument!")
+ file_path = sys.argv[1]
+
+ extractor = get_metadata_extractor(file_path)
+ if extractor == None:
+ raise Exception("No MetaDataExtractor found for %s file." % file_path)
+ extractor.extract_metadata(file_path)
+ print "Extracted meta data ..."
+ print extractor.get_metadata()
+
+ # test on LCIO file
+ #lcioFile = '/nfs/slac/g/hps3/data/datacat-test/data/hps_testrun_001351.slcio'
+ #print 'testing LcioMetaDataExtractor on %s' % lcioFile
+ #extractor = LcioMetaDataExtractor()
+ #extractor.extract_metadata(lcioFile)
+ #print extractor.get_metadata()
+ #print
+
+ # test on EVIO file
+ #evioFile = '/nfs/slac/g/hps3/data/datacat-test/data/hps_001351.evio'
+ #print 'testing EvioMetaDataExtractor on %s' % evioFile
+ #extractor = EvioMetaDataExtractor()
+ #extractor.extract_metadata(evioFile)
+ #print extractor.get_metadata()
+ #print
+
+ # test on ROOT file
+ #rootFile = '/nfs/slac/g/hps3/data/testrun/runs/recon_new/hps_001351.evio.0_recon.root'
+ #print 'testing RootDstDataExtractor on %s' % rootFile
+ #extractor = RootDstMetaDataExtractor()
+ #extractor.extract_metadata(rootFile)
+ #print extractor.get_metadata()
+
\ No newline at end of file
java/sandbox/data-cat/src/main/python/hpsdatacat
--- java/sandbox/data-cat/src/main/python/hpsdatacat/find.py 2014-06-04 02:03:54 UTC (rev 662)
+++ java/sandbox/data-cat/src/main/python/hpsdatacat/find.py 2014-06-04 02:04:40 UTC (rev 663)
@@ -1,22 +1,17 @@
-#!/usr/bin/env python2.7
+#!/usr/bin/env python
"""
-Script wrapper for finding files in the data catalog.
-
-author: Jeremy McCormick <[log in to unmask]>
+Find files in the data catalog using the 'find' command.
"""
-# Python lib imports
import argparse, os.path, subprocess, socket, getpass
-
-# import utility stuff from hpsdatacat
from util import *
# data catalog command to be executed
__command = 'find'
# default options for search command
-script_options = '--search-groups --recurse'
+__script_options = '--search-groups --recurse'
# default path in the data catalog
default_path = get_default_search_path()
@@ -64,7 +59,7 @@
command_line = create_base_command_line(__command, connection_string, dry_run, mode)
# add the standard options for this command
-command_line += ' %s' % script_options
+command_line += ' %s' % __script_options
# add site
command_line += ' --site %s' % site
java/sandbox/data-cat/src/main/python/hpsdatacat
--- java/sandbox/data-cat/src/main/python/hpsdatacat/register.py 2014-06-04 02:03:54 UTC (rev 662)
+++ java/sandbox/data-cat/src/main/python/hpsdatacat/register.py 2014-06-04 02:04:40 UTC (rev 663)
@@ -1,9 +1,7 @@
-#!/usr/bin/env python2.7
+#!/usr/bin/env python
"""
-Script wrapper for updating HPS data catalog via SSH connection_string to SLAC.
-
-author: Jeremy McCormick <[log in to unmask]>
+Register new files in the data catalog using the 'registerDataset' command.
"""
import argparse, os.path, subprocess, socket, getpass
@@ -19,11 +17,11 @@
site = get_default_site()
parser = create_base_argparser(__command)
-parser.add_argument('-p', '--path', help='destination path in data catalog')
-parser.add_argument('-d', '--dataset', help='input physical dataset')
-parser.add_argument('-m', '--metadata', help='define a meta data field value with format key=value', action='append')
-parser.add_argument('-g', '--group', help='dataset group')
-parser.add_argument('-s', '--site', help='dataset site')
+parser.add_argument('-p', '--path', help='destination folder in the data catalog')
+parser.add_argument('-f', '--file', help='input physical file to register')
+parser.add_argument('-d', '--define', help='define a single meta data field with format key=value', action='append')
+parser.add_argument('-g', '--group', help='group under the path')
+parser.add_argument('-s', '--site', help='site of the new file e.g. SLAC or JLAB')
args = vars(parser.parse_args())
connection_string, dry_run, mode = handle_standard_arguments(args)
@@ -31,21 +29,21 @@
if args['path'] != None:
path = args['path']
else:
- raise Exception('path is required!')
+ raise Exception('The destination path is required!')
-if args['dataset'] != None:
- dataset = args['dataset']
+if args['file'] != None:
+ local_file = args['file']
else:
- raise Exception('dataset is required!')
+ raise Exception('The local file is required!')
-file_extension = os.path.splitext(dataset)[1][1:]
+file_extension = os.path.splitext(local_file)[1][1:]
if args['group'] != None:
group = args['group']
metadata = None
-raw_metadata = args['metadata']
-if args['metadata'] != None:
+raw_metadata = args['define']
+if args['define'] != None:
metadata = format_metadata(raw_metadata)
if args['site'] != None:
@@ -68,16 +66,16 @@
if metadata != None:
command_line += ' %s' % metadata
-# append file type, path and dataset
-command_line += ' %s %s %s' % (file_extension, path, dataset)
+# append file type, path and local_file
+command_line += ' %s %s %s' % (file_extension, path, local_file)
# run the command
lines, errors, return_value = run_process(command_line)
-# print dataset information if command was successful
+# print local_file information if command was successful
if return_value == 0:
- print 'Added dataset to catalog ...'
- print ' dataset: %s' % dataset
+ print 'Added local_file to catalog ...'
+ print ' local_file: %s' % local_file
print ' path: %s' % path
print ' group: %s' % group
print ' site: %s' % site
java/sandbox/data-cat/src/main/python/hpsdatacat
--- java/sandbox/data-cat/src/main/python/hpsdatacat/util.py 2014-06-04 02:03:54 UTC (rev 662)
+++ java/sandbox/data-cat/src/main/python/hpsdatacat/util.py 2014-06-04 02:04:40 UTC (rev 663)
@@ -1,15 +1,16 @@
+#!/usr/bin/env python
+
"""
-Utility function library for HPS data catalog wrapper scripts.
+Miscellaneous global utility functions.
"""
-# Python lib imports
-import getpass, socket, subprocess, argparse
+import sys, os, getpass, socket, subprocess, argparse
# location of data catalog script at SLAC
__datacat_script = '~srs/datacat/prod/datacat-hps'
-# commands that have script wrappers for them
-__valid_commands = ('rm', 'registerDataset', 'addLocation', 'addMetaData', 'find')
+# valid commands
+__valid_commands = ('rm', 'registerDataset', 'addLocation', 'addMetaData', 'find', 'crawler')
# valid mode settings
__valid_modes = ('PROD', 'DEV', 'TEST')
@@ -108,7 +109,7 @@
return raw_string.replace('"', '\\"').replace(' ', '\\ ').replace('&', '\\&')
"""
-Format meta data for a command from the command line arguments.
+Format meta data for an SSH command from the command line arguments.
This function will return None if raw_metadata is empty.
"""
def format_metadata(raw_metadata):
@@ -131,7 +132,7 @@
parser = argparse.ArgumentParser(description='Execute the %s command on the data catalog' % command)
parser.add_argument('-D', '--dry-run', help='perform dry run only with no database commits', action='store_true')
parser.add_argument('-M', '--mode', help='set data source as PROD, DEV, or TEST')
- parser.add_argument('-c', '--connection_string', help='SSH connection string in form user@host')
+ parser.add_argument('-c', '--connection-string', help='SSH connection string in form user@host')
return parser
"""
@@ -154,7 +155,7 @@
return connection_string, dry_run, mode
"""
-Print the result of running the command.
+Print the results of running a command.
"""
def print_result(command, return_value, errors, printSuccessful=True):
if return_value != 0 or len(errors) != 0:
@@ -168,7 +169,7 @@
print "return_value: %s" % str(return_value)
"""
-Create the basic SSH command.
+Create the basic SSH command from common arguments.
"""
def create_base_command_line(command, connection_string, dry_run, mode):
command_line = 'ssh %s' % (connection_string)
@@ -180,8 +181,22 @@
return command_line
"""
-Check if site is valid.
+Check if a site looks valid.
"""
def check_valid_site(site):
if site not in __valid_sites:
- raise Exception("Site is not valid: " + site)
\ No newline at end of file
+ raise Exception("Site is not valid: " + site)
+
+"""
+Send stdout and stderr to /dev/null e.g. to suppress messages.
+"""
+def suppress_print():
+ sys.stdout = open(os.devnull, 'w')
+ sys.stderr = open(os.devnull, 'w')
+
+"""
+Restore stdout and stderr after suppressing them.
+"""
+def restore_print():
+ sys.stdout = sys.__stdout__
+ sys.stderr = sys.__stderr__
\ No newline at end of file
SVNspam 0.1