diff --git a/README.md b/README.md index c0c1301..08513a5 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# archer_archiving v1.0.0 +# archer_archiving v1.1.0 This script is for backing up data from the Archer analysis platform to DNAnexus for long term storage. The Archer analysis software runs on a virtual server with limited storage space (1TB), and new analyses fail if space runs out. Once projects have been archived within the Archer analysis platform (set to be performed automatically after a certain time period), the project folder contents on the archer platform are copied to the Genomics Server with rsync, compressed with tar and uploaded to the relevant DNAnexus project. The project folder and associated fastq files are then deleted from the Archer server, leaving the (empty) project folder in place. @@ -13,7 +13,7 @@ Once projects have been archived within the Archer analysis platform (set to be ## Docker The scripts can be run from within a docker container when docker = True in the archer_archive_config.py file. This can be run using the command -`sudo docker run --rm -v /usr/local/src/mokaguys/logfiles:/mokaguys/logfiles -v /usr/local/src/mokaguys/dx_downloads:/mokaguys/dx_downloads -v /usr/local/src/mokaguys/.dnanexus_auth_token:/mokaguys/.dnanexus_auth_token -v /usr/local/src/mokaguys/.archerVM_pw:/mokaguys/.archerVM_pw archer_archiving:latest` +`sudo docker run --rm --log-driver syslog -v /var/log:/var/log -v /usr/local/src/mokaguys/logfiles:/mokaguys/logfiles -v /usr/local/src/mokaguys/dx_downloads:/mokaguys/dx_downloads -v /usr/local/src/mokaguys/.dnanexus_auth_token:/mokaguys/.dnanexus_auth_token -v /usr/local/src/mokaguys/.archerVM_pw:/mokaguys/.archerVM_pw archer_archiving:latest` (replacing the tag `latest` as required). 
### using ssh within the Docker image diff --git a/archer_archive_script.py b/archer_archive_script.py index d7b85af..6d075d3 100644 --- a/archer_archive_script.py +++ b/archer_archive_script.py @@ -46,13 +46,14 @@ def set_up_ssh_known_hosts(self): cmd="mkdir -p ~/.ssh; touch ~./ssh/known_hosts;\ if [ -z $(ssh-keygen -F grpvgaa01.viapath.local) ]; then \ ssh-keyscan -H grpvgaa01.viapath.local >> ~/.ssh/known_hosts; fi; ssh-keygen -F grpvgaa01.viapath.local" + self.script_logfile.write("\tCommand to set up ssh known hosts: '%s'\n" % (cmd)) out, err = self.execute_subprocess_command(cmd) if self.success_in_stdout(out, "Host grpvgaa01.viapath.local found"): - self.logger("host added to known hosts ok", "SSH set up") + self.logger("host added to known hosts ok", "Archer archive SSH set up") return True else: - self.logger("host NOT added to known hosts", "SSH set up") # Rapid7 alert set up + self.logger("host NOT added to known hosts", "Archer archive SSH set up") # Rapid7 alert set up return False def list_archer_projects(self): @@ -71,11 +72,10 @@ def list_archer_projects(self): sshpass -p $archer_pw ssh s_archerupload@grpvgaa01.viapath.local ls %s" % ( config.path_to_archerdx_pw, config.path_to_analysis_folder) + self.script_logfile.write("\tCommand to list Archer projects: '%s'\n" % (cmd)) out,err = self.execute_subprocess_command(cmd) - self.logger("command run to list projects: %s" % (cmd), "Archer archive") # for each item in the out (list of items in the /var/www/analysis folder) yeild the name if length=4 for folder_name in out.split("\n"): - self.logger("checking folder %s" % (folder_name), "Archer archive") if len(folder_name) == 4: self.logger("identified project %s" % (folder_name), "Archer archive") yield folder_name @@ -121,6 +121,7 @@ def check_project_archived(self,archer_project_ID): sshpass -p $archer_pw ssh s_archerupload@grpvgaa01.viapath.local ls %s" % ( config.path_to_archerdx_pw, os.path.join(config.path_to_analysis_folder,archer_project_ID)) + 
self.script_logfile.write("\tCommand to list project contents: '%s'\n" % (cmd)) out,err = self.execute_subprocess_command(cmd) # look through list of contents for file "archer_project_ID.tar.gz" archer_tar = archer_project_ID + ".tar.gz" @@ -155,6 +156,7 @@ def list_archer_project_files(self,archer_project_ID): config.path_to_archerdx_pw, os.path.join(config.path_to_analysis_folder,archer_project_ID), fastq_loc_file) + self.script_logfile.write("\tCommand to list and record project files: '%s'\n" % (cmd)) out,err = self.execute_subprocess_command(cmd) # check for errors in the stdout if self.success_in_stdout(out.rstrip(), "0"): @@ -191,9 +193,9 @@ def copy_archer_project(self,archer_project_ID): os.path.join(config.path_to_analysis_folder,archer_project_ID), config.copy_location ) + self.script_logfile.write("\tCommand to copy archer project files: '%s'\n" % (cmd)) # capture stdout and look for exit code out,err = self.execute_subprocess_command(cmd) - self.logger("rsync cmd: %s\nout: %s" % (cmd,out), "Archer archive") if self.success_in_stdout(out.rstrip(), "0"): self.logger("folder for Archer project %s copied to genomics server." 
% (archer_project_ID), "Archer archive") return True @@ -215,8 +217,8 @@ def create_project_tar(self,archer_project_ID): # redirect stderr to stdout so we can test for errors tarfile_name = "%s.tar.gz" % (archer_project_ID) cmd = "cd %s; tar -czf %s %s 2>&1" % (config.copy_location,tarfile_name,archer_project_ID) + self.script_logfile.write("\tCommand to create tar archive on genomics server: '%s'\n" % (cmd)) out, err = self.execute_subprocess_command(cmd) - self.logger("tar cmd: %s\nout: %s" % (cmd,out), "Archer archive") # assess stdout+stderr - if successful tar does not return any output if len(out) ==0: self.logger("Tar of archer project %s generated successfully" % (archer_project_ID),"Archer archive") @@ -235,8 +237,8 @@ def find_DNAnexus_project(self,archer_project_ID,project_adx): # search DNAnexus for project matching project_adx (ADX###) #cmd = config.source_command+";dx find projects --name='*%s*' --auth-token %s" % (project_adx,config.Nexus_API_Key) cmd = "dx find projects --name='*%s*' --auth-token %s" % (project_adx,config.Nexus_API_Key) + self.script_logfile.write("\tCommand to find DNAnexus project: '%s'\n" % (cmd)) out,err = self.execute_subprocess_command(cmd) - self.logger("find_DNAnexus_project() cmd: %s" % (cmd),"Archer archive") # count number of projects returned. 
If unable to identify a single matching project return error # note: if one project found len(matchingprojects)=2 because there is a newline matching_projects = out.split("\n") @@ -264,8 +266,8 @@ def upload_to_dnanexus(self,file_list,dnanexus_projectname): config.Nexus_API_Key, dnanexus_projectname, list_of_files) + self.script_logfile.write("\tCommand to upload files to DNAnexus using upload agent: '%s'\n" % (cmd)) out,err = self.execute_subprocess_command(cmd) - self.logger("dnanexus upload agent command: %s\nout: %s\n%s" % (cmd,out,err),"Archer archive") # check output of this command if out.startswith("file-"): #if self.success_in_stdout(out,"file*"): @@ -293,9 +295,8 @@ def cleanup_archer_project_folder(self,archer_project_ID): sshpass -p $archer_pw ssh s_archerupload@grpvgaa01.viapath.local rm -r %s/*; echo $?" % ( config.path_to_archerdx_pw, os.path.join(config.path_to_analysis_folder,archer_project_ID)) - self.logger("command to cleanup project on archer server: %s" %(cmd), "Archer archive") + self.script_logfile.write("\tCommand to cleanup project on archer server: '%s'\n" % (cmd)) out,err = self.execute_subprocess_command(cmd) - self.logger("clean up archer project cmd: %s\nout: %s" % (cmd,out),"Archer archive") # check for success in stdout if self.success_in_stdout(out,"0"): self.logger("Archer project folder %s emptied." % (archer_project_ID),"Archer archive") @@ -319,8 +320,7 @@ def cleanup_archer_fastqs(self,project_adx): sshpass -p $archer_pw ssh s_archerupload@grpvgaa01.viapath.local rm %s; echo $?" 
% ( config.path_to_archerdx_pw, path_to_fastqs) - - self.logger("command to cleanup fastq files: %s" % (cmd), "Archer archive") + self.script_logfile.write("\tCommand to cleanup archer FASTQs: '%s'\n" % (cmd)) out,err = self.execute_subprocess_command(cmd) # check for success in stdout if self.success_in_stdout(out,"0"): @@ -347,10 +347,10 @@ def list_archer_fastq_for_deletion(self,project_adx): sshpass -p $archer_pw ssh s_archerupload@grpvgaa01.viapath.local ls %s; echo $?" % ( config.path_to_archerdx_pw, path_to_fastqs) - + self.script_logfile.write("\tCommand to list Archer FASTQs for deletion: '%s'\n" % (cmd)) out,err = self.execute_subprocess_command(cmd) # write the list of files to the log - self.logger("List of fastq files to be deleted from %s for project %s:\n%s" % (path_to_fastqs,project_adx,out),"Archer archive") + self.script_logfile.write("\tList of fastq files to be deleted from %s for project %s:\n%s" % (path_to_fastqs,project_adx,out)) # return the file path to be used by clean_up_archer_fastqs() return path_to_fastqs @@ -364,7 +364,7 @@ def update_list_archived_projects(self,archer_project_ID,project_adx): # open the archived projects file and add the project ID of the archived project to the list with open(config.path_to_archived_project_ids,"a") as archived_projects_list: archived_projects_list.write("%s\n" % (archer_project_ID)) - self.logger("Project ID %s added to archived projects list" % (project_adx),"Archer archive") + self.script_logfile.write("\tProject ID %s added to archived projects list\n" % (project_adx)) def cleanup_genomics_server(self,archer_project_ID): """ @@ -377,6 +377,7 @@ def cleanup_genomics_server(self,archer_project_ID): path_to_project_folder = os.path.join(config.copy_location,"%s" % (archer_project_ID)) # command to delete the downloaded fastq files cmd = "rm -r %s*; echo $?" 
% (path_to_project_folder) + self.script_logfile.write("\tCommand to clean up Genomics Server: '%s'\n" % (cmd)) out, err = self.execute_subprocess_command(cmd) if self.success_in_stdout(out, "0"): self.logger("Successfully deleted project folder and tar.gz file for project %s from genomics server" % (archer_project_ID), "Archer Archive") @@ -420,16 +421,19 @@ def logger(self, message, tool): Details about the logged event. tool (str) Tool name. Used to search within the insight ops website. + printing is required to send log information to stdout (allows logs to be sent to syslog when run in Docker) """ # Create subprocess command string, passing message and tool name to the command log = "/usr/bin/logger -t %s '%s'" % (tool, message) - + time = str('{:%Y%m%d_%H%M%S}'.format(datetime.datetime.now())) if subprocess.call([log], shell=True) == 0: # If the log command produced no errors, record the log command string to the script logfile. - self.script_logfile.write(tool + ": " + message + "\n") + self.script_logfile.write(time + " : " + tool + ": " + message + "\n") + print("%s : %s" % (tool,message)) # Else record failure to write to system log to the script log file else: - self.script_logfile.write("Failed to write log to /usr/bin/logger\n" + log + "\n") + self.script_logfile.write(time + " : Failed to write log to /var/log/syslog\n" + log + "\n") + print("Failed to write log to /var/log/syslog %s : %s" % (tool,message)) def go(self): """ diff --git a/git_tag.py b/git_tag.py index e67925d..109d302 100644 --- a/git_tag.py +++ b/git_tag.py @@ -4,13 +4,13 @@ def git_tag(): '''rather than hard code the script release, read it directly from the repository''' - # set the command which prints the git tags for the folder containing the script that is being executed. The tag looks like "v22-3-gccfd" so needs to be parsed. use awk to create an array "a", splitting on "-". 
The print the first element of the array - cmd = "git -C " + os.path.dirname(os.path.realpath(__file__)) + " describe --tags | awk '{split($0,a,\"-\"); print a[1]}'" + # set the command which prints the git tags for the folder containing the script that is being executed. The tag looks like "v1.2.0-3-gccfd" + cmd = "git -C " + os.path.dirname(os.path.realpath(__file__)) + " describe --tags" # use subprocess to execute command proc = subprocess.Popen([cmd], stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True) out, err = proc.communicate() # return standard out, removing any new line characters - return out.rstrip() + return out.rstrip().decode("utf-8") if __name__ == "__main__": - git_tag() \ No newline at end of file + print(git_tag()) \ No newline at end of file