Source code for readers.ge

import sys
import pprint
import xmltodict
import configparser
import pandas as pd
import pathlib
from datetime import datetime
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl import load_workbook

def _parse_config_file(path):
    """Parse the INI-style file at *path* into ``{section: {option: value}}``.

    The file is decoded as latin-1 and value interpolation is disabled, so a
    literal ``%`` in a value is returned unchanged.
    """
    parser = configparser.ConfigParser(interpolation=None)
    with open(path, encoding='latin-1') as f:
        parser.read_file(f)
    return {section: dict(parser.items(section)) for section in parser.sections()}


def extract_meta_from_config(file_name):
    """Extract meta data from an INI-style config file (e.g. pca / pcr).

    If *file_name* does not exist, a sibling file whose stem carries a
    ``_rar`` suffix (``scan.pca`` -> ``scan_rar.pca``) is tried instead.

    Parameters
    ----------
    file_name : str or pathlib.Path
        Path of the config file to read.

    Returns
    -------
    dict
        One entry per section, each mapping option name to its string value.

    Raises
    ------
    SystemExit
        With status 1 when neither *file_name* nor its ``_rar`` sibling exists.
    """
    try:
        return _parse_config_file(file_name)
    except FileNotFoundError:
        print("ERROR: %s is missing. Looking for a _rar file" % file_name)

    # Build the fallback name from the Path object so plain strings work too,
    # and append the suffix by hand so a dotted stem is not truncated.
    fname = pathlib.Path(file_name)
    file_name_rar = fname.with_name(fname.stem + '_rar' + fname.suffix)
    try:
        meta_dict = _parse_config_file(file_name_rar)
    except FileNotFoundError:
        print("ERROR: %s is also missing" % file_name_rar)
        sys.exit(1)

    # Only announce the fallback once it has actually been opened and parsed.
    print('Found %s file' % file_name_rar)
    return meta_dict
def extract_meta_from_dtxml(file_name):
    """Extract meta data from a dtxml file.

    Reads the ``<property name=... value=...>`` entries found under
    ``project/additional_project_info`` and returns them as a flat mapping.

    Parameters
    ----------
    file_name : str or pathlib.Path
        Path of the dtxml file; it is decoded as latin-1.

    Returns
    -------
    dict
        Maps each property's ``name`` attribute to its ``value`` attribute.
        Empty when the section holds no ``property`` entries.
    """
    with open(file_name, encoding='latin-1') as f:
        xml_dict = xmltodict.parse(f.read())

    info = xml_dict['project']['additional_project_info']
    # An empty element is parsed as None, so there is nothing to look up.
    properties = info.get('property', []) if info else []
    # xmltodict only produces a list for repeated elements; a lone <property>
    # arrives as a single dict and must be wrapped before iterating.
    if isinstance(properties, dict):
        properties = [properties]

    return {prop['@name']: prop['@value'] for prop in properties}
def _read_pcp_timestamps(path):
    """Return the ``(start, end)`` datetimes recorded in the pcp file at *path*.

    The start time is the last tab-separated field of the third line; the end
    time is the last tab-separated field of the last non-blank line.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    with open(path, encoding='latin-1') as f:
        lines = f.readlines()

    datetime_start = datetime.strptime(lines[2].split("\t")[-1].strip(), time_format)
    # Skip trailing blank lines so a file ending in an empty line still parses.
    last_line = next(line for line in reversed(lines) if line.strip())
    datetime_end = datetime.strptime(last_line.split("\t")[-1].strip(), time_format)
    return datetime_start, datetime_end


def extract_meta_from_pcp(file_name):
    """Extract the scan duration and start time from a pcp file.

    If *file_name* does not exist, a sibling file whose stem carries a
    ``_rar`` suffix (``scan.pcp`` -> ``scan_rar.pcp``) is tried instead.

    Parameters
    ----------
    file_name : str or pathlib.Path
        Path of the pcp file to read.

    Returns
    -------
    tuple
        ``(scan_time, datetime_start)`` where ``scan_time`` is the
        ``timedelta`` between the first and last recorded timestamps and
        ``datetime_start`` is the first timestamp.

    Raises
    ------
    SystemExit
        With status 1 when neither *file_name* nor its ``_rar`` sibling exists.
    """
    try:
        datetime_start, datetime_end = _read_pcp_timestamps(file_name)
    except FileNotFoundError:
        print("ERROR: %s is missing. Looking for a _rar file" % file_name)
        # Build the fallback name from the Path object so plain strings work
        # too, and append the suffix by hand so a dotted stem is not truncated.
        fname = pathlib.Path(file_name)
        file_name_rar = fname.with_name(fname.stem + '_rar' + fname.suffix)
        try:
            datetime_start, datetime_end = _read_pcp_timestamps(file_name_rar)
        except FileNotFoundError:
            print("ERROR: %s is also missing" % file_name_rar)
            sys.exit(1)
        # Only announce the fallback once it has actually been read.
        print('Found %s file' % file_name_rar)

    scan_time = datetime_end - datetime_start
    return scan_time, datetime_start
def _build_scan_record(p):
    """Collect the meta data of one scan into an insertion-ordered dict.

    *p* is the run folder joined with the scan base name; the ``.pcp``,
    ``.pcr``, ``.pca`` and ``.dtxml`` files are derived from it by suffix.
    """
    scan_time, datetime_start = extract_meta_from_pcp(p.with_suffix('.pcp'))
    # The pcr content is not written to the spreadsheet, but the file is still
    # read so that a missing .pcr is reported the same way as the others.
    extract_meta_from_config(p.with_suffix('.pcr'))
    pca = extract_meta_from_config(p.with_suffix('.pca'))
    xml = extract_meta_from_dtxml(p.with_suffix('.dtxml'))

    # Dictionary keys assignment. The order of the assignment defines the
    # order of the xlsx file columns.
    record = {}
    record['scan date'] = datetime_start
    record['Operator'] = xml['Operator']
    record['Researcher'] = xml['Researcher']
    record['NMNH PI'] = xml['NMNH PI']
    record['Department'] = xml['Department']
    record['CT project #'] = xml['Project Number']
    record['specimen ID or USNM#'] = xml['Sample ID']
    record['Species name'] = xml['Sample Name']
    record['stain'] = xml['Description']
    try:
        record['Sample type'] = xml['Sample Type']
    except KeyError:
        print('Sample type is missing, added empty field')
        record['Sample type'] = 'Empty'
    record['folder name'] = p.stem
    record['timing (ms)'] = pca['Detector']['timingval']
    record['frame avg'] = pca['Detector']['avg']
    record['skip'] = pca['CT']['skipacc']
    record['binning'] = pca['Detector']['binning']
    record['sensitivity'] = pca['Detector']['cameragain']
    record['# images'] = pca['CT']['numberimages']
    record['total scanning time (hrs)'] = str(scan_time)
    record['voltage (kV)'] = pca['Xray']['voltage']
    record['current (uA)'] = pca['Xray']['current']
    record['act. power (W)'] = "Manual Entry"
    record['magnification'] = pca['Geometry']['magnification']
    record['voxel size (um)'] = float(pca['Geometry']['voxelsizex']) * 1000.0
    record['tube type'] = pca['Xray']['name']
    record['mode'] = pca['Xray']['mode']
    record['multiscan (# of scans)'] = pca['Multiscan']['active']
    record['filter'] = "Manual Entry"
    record['target'] = "Manual Entry"
    record['collimator'] = pca['Xray']['collimation']
    # Specific to Freya
    record['Trawl number or collection event ID'] = "Manual Entry"
    record['same specimen as Scan x'] = "Manual Entry"
    record['specimen pixel width'] = "Manual Entry"
    # additional meta data
    record['general system name'] = pca['General']['systemname']
    return record


def _write_record(df, file_name_xlsx):
    """Append the row(s) of *df* to the workbook, creating it when absent."""
    if pathlib.Path(file_name_xlsx).is_file():
        wb = load_workbook(filename=file_name_xlsx)
        ws = wb["Sheet1"]
        # No index and no column headers: the existing sheet already has them.
        for row in dataframe_to_rows(df, index=False, header=False):
            ws.append(row)
        wb.save(file_name_xlsx)
        print('Append to existing meta-data excel file at: %s' % file_name_xlsx)
    else:
        df.to_excel(file_name_xlsx, header=True, index=False)
        print('Create a new meta-data excel file at: %s' % file_name_xlsx)


def main(args):
    """Add one row per run-file folder to the ``master.xlsx`` spreadsheet.

    Parameters
    ----------
    args : list of str
        Command-line style argument list: ``args[0]`` is the program name and
        every following item is the path to a run-file folder. Each folder is
        expected to hold ``.pca``, ``.pcp``, ``.pcr`` and ``.dtxml`` files
        named after the folder. ``master.xlsx`` is located in the folder's
        parent directory.

    Raises
    ------
    SystemExit
        With status 1 when no folder is given.
    """
    if len(args) < 2:
        print('ERROR: Must provide the path to a run-file folder as the argument')
        print('Example:')
        print(' python ge.py /Users/decarlo/conda/nocturn/data/FEG230530_413/')
        sys.exit(1)

    for arg in args[1:]:
        print('Looking into %s' % arg)
        folder = pathlib.Path(arg)
        if not folder.is_dir():
            print('ERROR: %s does not exist' % folder)
            continue

        # Scan files share the folder's own name as their base name.
        p = folder.joinpath(folder.stem)
        file_name_xlsx = p.parents[1].joinpath('master').with_suffix('.xlsx')

        record = _build_scan_record(p)
        df = pd.DataFrame(data=record, index=[0])
        _write_record(df, file_name_xlsx)
# Script entry point: hand the raw command-line arguments to main().
if __name__ == "__main__":
    main(sys.argv)