Skip to content

Commit

Permalink
Merge pull request #4 from moka-guys/v1.1.0
Browse files Browse the repository at this point in the history
V1.1.0 (#4)
  • Loading branch information
Graeme-Smith authored May 23, 2023
2 parents a21d521 + 1d034ec commit a0fe235
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 24 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# archer_archiving v1.0.0
# archer_archiving v1.1.0
This script is for backing up data from the Archer analysis platform to DNAnexus for long term storage. The Archer analysis software runs on a virtual server with limited storage space (1TB), and new analyses fail if space runs out.

Once projects have been archived within the Archer analysis platform (set to be performed automatically after a certain time period), the project folder contents on the Archer platform are copied to the Genomics Server with rsync, compressed with tar and uploaded to the relevant DNAnexus project. The project folder contents and associated fastq files are then deleted from the Archer server, leaving the (empty) project folder in place.
Expand All @@ -13,7 +13,7 @@ Once projects have been archived within the Archer analysis platform (set to be

## Docker
The scripts can be run from within a Docker container when `docker = True` is set in the archer_archive_config.py file. The container can be started using the command:
`sudo docker run --rm -v /usr/local/src/mokaguys/logfiles:/mokaguys/logfiles -v /usr/local/src/mokaguys/dx_downloads:/mokaguys/dx_downloads -v /usr/local/src/mokaguys/.dnanexus_auth_token:/mokaguys/.dnanexus_auth_token -v /usr/local/src/mokaguys/.archerVM_pw:/mokaguys/.archerVM_pw archer_archiving:latest`
`sudo docker run --rm --log-driver syslog -v /var/log:/var/log -v /usr/local/src/mokaguys/logfiles:/mokaguys/logfiles -v /usr/local/src/mokaguys/dx_downloads:/mokaguys/dx_downloads -v /usr/local/src/mokaguys/.dnanexus_auth_token:/mokaguys/.dnanexus_auth_token -v /usr/local/src/mokaguys/.archerVM_pw:/mokaguys/.archerVM_pw archer_archiving:latest`
(replacing the tag `latest` as required).

### using ssh within the Docker image
Expand Down
40 changes: 22 additions & 18 deletions archer_archive_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,14 @@ def set_up_ssh_known_hosts(self):
cmd="mkdir -p ~/.ssh; touch ~./ssh/known_hosts;\
if [ -z $(ssh-keygen -F grpvgaa01.viapath.local) ]; then \
ssh-keyscan -H grpvgaa01.viapath.local >> ~/.ssh/known_hosts; fi; ssh-keygen -F grpvgaa01.viapath.local"
self.script_logfile.write("\tCommand to set up ssh known hosts: '%s'\n" % (cmd))
out, err = self.execute_subprocess_command(cmd)

if self.success_in_stdout(out, "Host grpvgaa01.viapath.local found"):
self.logger("host added to known hosts ok", "SSH set up")
self.logger("host added to known hosts ok", "Archer archive SSH set up")
return True
else:
self.logger("host NOT added to known hosts", "SSH set up") # Rapid7 alert set up
self.logger("host NOT added to known hosts", "Archer archive SSH set up") # Rapid7 alert set up
return False

def list_archer_projects(self):
Expand All @@ -71,11 +72,10 @@ def list_archer_projects(self):
sshpass -p $archer_pw ssh [email protected] ls %s" % (
config.path_to_archerdx_pw,
config.path_to_analysis_folder)
self.script_logfile.write("\tCommand to list Archer projects: '%s'\n" % (cmd))
out,err = self.execute_subprocess_command(cmd)
self.logger("command run to list projects: %s" % (cmd), "Archer archive")
# for each item in out (the list of items in the /var/www/analysis folder) yield the name if its length is 4
for folder_name in out.split("\n"):
self.logger("checking folder %s" % (folder_name), "Archer archive")
if len(folder_name) == 4:
self.logger("identified project %s" % (folder_name), "Archer archive")
yield folder_name
Expand Down Expand Up @@ -121,6 +121,7 @@ def check_project_archived(self,archer_project_ID):
sshpass -p $archer_pw ssh [email protected] ls %s" % (
config.path_to_archerdx_pw,
os.path.join(config.path_to_analysis_folder,archer_project_ID))
self.script_logfile.write("\tCommand to list project contents: '%s'\n" % (cmd))
out,err = self.execute_subprocess_command(cmd)
# look through list of contents for file "archer_project_ID.tar.gz"
archer_tar = archer_project_ID + ".tar.gz"
Expand Down Expand Up @@ -155,6 +156,7 @@ def list_archer_project_files(self,archer_project_ID):
config.path_to_archerdx_pw,
os.path.join(config.path_to_analysis_folder,archer_project_ID),
fastq_loc_file)
self.script_logfile.write("\tCommand to list and record project files: '%s'\n" % (cmd))
out,err = self.execute_subprocess_command(cmd)
# check for errors in the stdout
if self.success_in_stdout(out.rstrip(), "0"):
Expand Down Expand Up @@ -191,9 +193,9 @@ def copy_archer_project(self,archer_project_ID):
os.path.join(config.path_to_analysis_folder,archer_project_ID),
config.copy_location
)
self.script_logfile.write("\tCommand to copy archer project files: '%s'\n" % (cmd))
# capture stdout and look for exit code
out,err = self.execute_subprocess_command(cmd)
self.logger("rsync cmd: %s\nout: %s" % (cmd,out), "Archer archive")
if self.success_in_stdout(out.rstrip(), "0"):
self.logger("folder for Archer project %s copied to genomics server." % (archer_project_ID), "Archer archive")
return True
Expand All @@ -215,8 +217,8 @@ def create_project_tar(self,archer_project_ID):
# redirect stderr to stdout so we can test for errors
tarfile_name = "%s.tar.gz" % (archer_project_ID)
cmd = "cd %s; tar -czf %s %s 2>&1" % (config.copy_location,tarfile_name,archer_project_ID)
self.script_logfile.write("\tCommand to create tar archive on genomics server: '%s'\n" % (cmd))
out, err = self.execute_subprocess_command(cmd)
self.logger("tar cmd: %s\nout: %s" % (cmd,out), "Archer archive")
# assess stdout+stderr - if successful tar does not return any output
if len(out) ==0:
self.logger("Tar of archer project %s generated successfully" % (archer_project_ID),"Archer archive")
Expand All @@ -235,8 +237,8 @@ def find_DNAnexus_project(self,archer_project_ID,project_adx):
# search DNAnexus for project matching project_adx (ADX###)
#cmd = config.source_command+";dx find projects --name='*%s*' --auth-token %s" % (project_adx,config.Nexus_API_Key)
cmd = "dx find projects --name='*%s*' --auth-token %s" % (project_adx,config.Nexus_API_Key)
self.script_logfile.write("\tCommand to find DNAnexus project: '%s'\n" % (cmd))
out,err = self.execute_subprocess_command(cmd)
self.logger("find_DNAnexus_project() cmd: %s" % (cmd),"Archer archive")
# count number of projects returned. If unable to identify a single matching project return error
# note: if one project found len(matchingprojects)=2 because there is a newline
matching_projects = out.split("\n")
Expand Down Expand Up @@ -264,8 +266,8 @@ def upload_to_dnanexus(self,file_list,dnanexus_projectname):
config.Nexus_API_Key,
dnanexus_projectname,
list_of_files)
self.script_logfile.write("\tCommand to upload files to DNAnexus using upload agent: '%s'\n" % (cmd))
out,err = self.execute_subprocess_command(cmd)
self.logger("dnanexus upload agent command: %s\nout: %s\n%s" % (cmd,out,err),"Archer archive")
# check output of this command
if out.startswith("file-"):
#if self.success_in_stdout(out,"file*"):
Expand Down Expand Up @@ -293,9 +295,8 @@ def cleanup_archer_project_folder(self,archer_project_ID):
sshpass -p $archer_pw ssh [email protected] rm -r %s/*; echo $?" % (
config.path_to_archerdx_pw,
os.path.join(config.path_to_analysis_folder,archer_project_ID))
self.logger("command to cleanup project on archer server: %s" %(cmd), "Archer archive")
self.script_logfile.write("\tCommand to cleanup project on archer server: '%s'\n" % (cmd))
out,err = self.execute_subprocess_command(cmd)
self.logger("clean up archer project cmd: %s\nout: %s" % (cmd,out),"Archer archive")
# check for success in stdout
if self.success_in_stdout(out,"0"):
self.logger("Archer project folder %s emptied." % (archer_project_ID),"Archer archive")
Expand All @@ -319,8 +320,7 @@ def cleanup_archer_fastqs(self,project_adx):
sshpass -p $archer_pw ssh [email protected] rm %s; echo $?" % (
config.path_to_archerdx_pw,
path_to_fastqs)

self.logger("command to cleanup fastq files: %s" % (cmd), "Archer archive")
self.script_logfile.write("\tCommand to cleanup archer FASTQs: '%s'\n" % (cmd))
out,err = self.execute_subprocess_command(cmd)
# check for success in stdout
if self.success_in_stdout(out,"0"):
Expand All @@ -347,10 +347,10 @@ def list_archer_fastq_for_deletion(self,project_adx):
sshpass -p $archer_pw ssh [email protected] ls %s; echo $?" % (
config.path_to_archerdx_pw,
path_to_fastqs)

self.script_logfile.write("\tCommand to list Archer FASTQs for deletion: '%s'\n" % (cmd))
out,err = self.execute_subprocess_command(cmd)
# write the list of files to the log
self.logger("List of fastq files to be deleted from %s for project %s:\n%s" % (path_to_fastqs,project_adx,out),"Archer archive")
self.script_logfile.write("\tList of fastq files to be deleted from %s for project %s:\n%s" % (path_to_fastqs,project_adx,out))
# return the file path to be used by clean_up_archer_fastqs()
return path_to_fastqs

Expand All @@ -364,7 +364,7 @@ def update_list_archived_projects(self,archer_project_ID,project_adx):
# open the archived projects file and add the project ID of the archived project to the list
with open(config.path_to_archived_project_ids,"a") as archived_projects_list:
archived_projects_list.write("%s\n" % (archer_project_ID))
self.logger("Project ID %s added to archived projects list" % (project_adx),"Archer archive")
self.script_logfile.write("\tProject ID %s added to archived projects list" % (project_adx),"Archer archive")

def cleanup_genomics_server(self,archer_project_ID):
"""
Expand All @@ -377,6 +377,7 @@ def cleanup_genomics_server(self,archer_project_ID):
path_to_project_folder = os.path.join(config.copy_location,"%s" % (archer_project_ID))
# command to delete the downloaded fastq files
cmd = "rm -r %s*; echo $?" % (path_to_project_folder)
self.script_logfile.write("\tCommand to clean up Genomics Server: '%s'\n" % (cmd))
out, err = self.execute_subprocess_command(cmd)
if self.success_in_stdout(out, "0"):
self.logger("Successfully deleted project folder and tar.gz file for project %s from genomics server" % (archer_project_ID), "Archer Archive")
Expand Down Expand Up @@ -420,16 +421,19 @@ def logger(self, message, tool):
Details about the logged event.
tool (str)
Tool name. Used to search within the insight ops website.
printing is required to send log information to stdout (allows logs to be sent to syslog when run in Docker)
"""
# Create subprocess command string, passing message and tool name to the command
log = "/usr/bin/logger -t %s '%s'" % (tool, message)

time = str('{:%Y%m%d_%H%M%S}'.format(datetime.datetime.now()))
if subprocess.call([log], shell=True) == 0:
# If the log command produced no errors, record the log command string to the script logfile.
self.script_logfile.write(tool + ": " + message + "\n")
self.script_logfile.write(time + " : " + tool + ": " + message + "\n")
print("%s : %s" % (tool,message))
# Else record failure to write to system log to the script log file
else:
self.script_logfile.write("Failed to write log to /usr/bin/logger\n" + log + "\n")
self.script_logfile.write(time + " : Failed to write log to /var/log/syslog\n" + log + "\n")
print("Failed to write log to /var/log/syslog %s : %s" % (tool,message))

def go(self):
"""
Expand Down
8 changes: 4 additions & 4 deletions git_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

def git_tag():
    """Return the git tag describing the checkout that contains this script.

    Rather than hard-coding the script release, read it directly from the
    repository by running ``git describe --tags`` against the folder holding
    this file. The output looks like "v1.2.0-3-gccfd".

    Returns:
        str: standard output of ``git describe --tags`` with trailing
        whitespace/newlines removed. An empty string is returned when git
        writes nothing to stdout or cannot be started.
    """
    repo_dir = os.path.dirname(os.path.realpath(__file__))
    # Pass the arguments as a list (no shell) so a path containing spaces or
    # shell metacharacters cannot break or alter the command.
    try:
        proc = subprocess.Popen(
            ["git", "-C", repo_dir, "describe", "--tags"],
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
    except OSError:
        # git executable not available: the shell-based command produced empty
        # stdout in this case, so keep returning an empty tag.
        return ""
    out, _err = proc.communicate()
    # communicate() returns bytes; decode so callers always receive text
    return out.rstrip().decode("utf-8")

if __name__ == "__main__":
    # When run directly, print the tag once so it can be read from the command line
    print(git_tag())

0 comments on commit a0fe235

Please sign in to comment.