# Source code for readers.ge
import sys
import pprint
import xmltodict
import configparser
import pandas as pd
import pathlib
from datetime import datetime
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl import load_workbook
# [docs]
def extract_meta_from_config(file_name):
    """Read an INI-style configuration file (.pca / .pcr) into a nested dict.

    The file is decoded as latin-1 and parsed with ``configparser`` with
    interpolation disabled, so a literal ``%`` inside a value is kept
    verbatim.  When ``file_name`` does not exist, a sibling file whose stem
    carries a ``_rar`` suffix is tried instead (``scan.pca`` ->
    ``scan_rar.pca``).

    Parameters
    ----------
    file_name : str or pathlib.Path
        Path of the configuration file.

    Returns
    -------
    dict
        ``{section: {option: value}}``.  Option names come back lower-cased
        (configparser default) and every value is a string.

    Raises
    ------
    SystemExit
        With status 1 when neither ``file_name`` nor its ``_rar`` variant
        exists.
    """

    def _parse(path):
        # Same parser settings for the primary and the fallback file:
        # interpolation=None keeps '%' characters in values untouched.
        config = configparser.ConfigParser(interpolation=None)
        with open(path, encoding='latin-1') as f:
            config.read_file(f)
        return {section: dict(config.items(section))
                for section in config.sections()}

    try:
        return _parse(file_name)
    except FileNotFoundError:
        print("ERROR: %s is missing. Looking for a _rar file" % file_name)

    # Coerce to Path so plain string arguments work as well, and build the
    # new name in one step so stems containing a dot are not truncated.
    fname = pathlib.Path(file_name)
    file_name_rar = fname.with_name(fname.stem + '_rar' + fname.suffix)
    try:
        meta_dict = _parse(file_name_rar)
    except FileNotFoundError:
        print("ERROR: %s is also missing" % file_name_rar)
        sys.exit(1)
    # Only announce the fallback once it has actually been opened.
    print('Found %s file' % file_name_rar)
    return meta_dict
# [docs]
def extract_meta_from_dtxml(file_name):
    """Extract the additional project properties from a .dtxml file.

    The file is read as latin-1 text and parsed with ``xmltodict``.  Every
    ``property`` entry found under ``project/additional_project_info`` is
    collected as ``{name attribute: value attribute}``.

    Parameters
    ----------
    file_name : str or pathlib.Path
        Path of the .dtxml file.

    Returns
    -------
    dict
        Mapping of each property's ``@name`` to its ``@value``.

    Raises
    ------
    KeyError
        If the expected ``project/additional_project_info/property``
        structure, or a ``@name``/``@value`` attribute, is absent.
    """
    with open(file_name, encoding='latin-1') as f:
        xml_dict = xmltodict.parse(f.read())

    properties = xml_dict['project']['additional_project_info']['property']
    # xmltodict returns a single mapping (not a one-element list) when the
    # parent holds exactly one <property> child; normalise so that case is
    # iterated the same way as the multi-property one.
    if isinstance(properties, dict):
        properties = [properties]

    return {prop['@name']: prop['@value'] for prop in properties}
# [docs]
def extract_meta_from_pcp(file_name):
    """Derive the scan start time and duration from a .pcp file.

    The last tab-separated field of the third line is taken as the start
    time stamp and the last tab-separated field of the final non-blank line
    as the end time stamp; both must match ``%Y-%m-%d %H:%M:%S``.  When
    ``file_name`` does not exist, a sibling file whose stem carries a
    ``_rar`` suffix is tried instead (``scan.pcp`` -> ``scan_rar.pcp``).

    Parameters
    ----------
    file_name : str or pathlib.Path
        Path of the .pcp file.

    Returns
    -------
    tuple
        ``(scan_time, datetime_start)`` where ``scan_time`` is the
        ``timedelta`` between end and start and ``datetime_start`` is the
        start ``datetime``.

    Raises
    ------
    SystemExit
        With status 1 when neither ``file_name`` nor its ``_rar`` variant
        exists.
    IndexError, ValueError
        If the file has fewer than three lines or a time stamp does not
        match the expected format.
    """
    time_format = '%Y-%m-%d %H:%M:%S'

    def _parse(path):
        # latin-1 for both the primary and the fallback file, so the result
        # does not depend on the locale's default encoding.
        with open(path, encoding='latin-1') as file:
            lines = file.readlines()
        # A trailing empty line must not be mistaken for the last record.
        while lines and not lines[-1].strip():
            lines.pop()
        start = datetime.strptime(lines[2].split("\t")[-1].strip(), time_format)
        end = datetime.strptime(lines[-1].split("\t")[-1].strip(), time_format)
        return end - start, start

    try:
        return _parse(file_name)
    except FileNotFoundError:
        print("ERROR: %s is missing. Looking for a _rar file" % file_name)

    # Coerce to Path so plain string arguments work as well.
    fname = pathlib.Path(file_name)
    file_name_rar = fname.with_name(fname.stem + '_rar' + fname.suffix)
    try:
        scan_time, datetime_start = _parse(file_name_rar)
    except FileNotFoundError:
        print("ERROR: %s is also missing" % file_name_rar)
        sys.exit(1)
    # Only announce the fallback once it has actually been opened.
    print('Found %s file' % file_name_rar)
    return scan_time, datetime_start
# [docs]
def _collect_scan_record(p):
    """Build the ordered meta-data record for the run whose files share base path ``p``."""
    scan_time, datetime_start = extract_meta_from_pcp(p.with_suffix('.pcp'))
    # The .pcr content is not used for any column yet, but the file is still
    # read so that a missing .pcr keeps aborting the run as before.
    my_pcr_dict = extract_meta_from_config(p.with_suffix('.pcr'))
    my_pca_dict = extract_meta_from_config(p.with_suffix('.pca'))
    my_xml_dict = extract_meta_from_dtxml(p.with_suffix('.dtxml'))

    # Insertion order below is the column order of the xlsx file.
    my_dict = {}
    my_dict['scan date'] = datetime_start
    my_dict['Operator'] = my_xml_dict['Operator']
    my_dict['Researcher'] = my_xml_dict['Researcher']
    my_dict['NMNH PI'] = my_xml_dict['NMNH PI']
    my_dict['Department'] = my_xml_dict['Department']
    my_dict['CT project #'] = my_xml_dict['Project Number']
    my_dict['specimen ID or USNM#'] = my_xml_dict['Sample ID']
    my_dict['Species name'] = my_xml_dict['Sample Name']
    my_dict['stain'] = my_xml_dict['Description']
    try:
        my_dict['Sample type'] = my_xml_dict['Sample Type']
    except KeyError:
        print('Sample type is missing, added empty field')
        my_dict['Sample type'] = 'Empty'
    my_dict['folder name'] = p.stem
    my_dict['timing (ms)'] = my_pca_dict['Detector']['timingval']
    my_dict['frame avg'] = my_pca_dict['Detector']['avg']
    my_dict['skip'] = my_pca_dict['CT']['skipacc']
    my_dict['binning'] = my_pca_dict['Detector']['binning']
    my_dict['sensitivity'] = my_pca_dict['Detector']['cameragain']
    my_dict['# images'] = my_pca_dict['CT']['numberimages']
    my_dict['total scanning time (hrs)'] = str(scan_time)
    my_dict['voltage (kV)'] = my_pca_dict['Xray']['voltage']
    my_dict['current (uA)'] = my_pca_dict['Xray']['current']
    my_dict['act. power (W)'] = "Manual Entry"
    my_dict['magnification'] = my_pca_dict['Geometry']['magnification']
    # NOTE(review): the x1000 presumably converts mm to um -- confirm the
    # unit of 'voxelsizex' in the .pca file.
    my_dict['voxel size (um)'] = float(my_pca_dict['Geometry']['voxelsizex']) * 1000.0
    my_dict['tube type'] = my_pca_dict['Xray']['name']
    my_dict['mode'] = my_pca_dict['Xray']['mode']
    my_dict['multiscan (# of scans)'] = my_pca_dict['Multiscan']['active']
    my_dict['filter'] = "Manual Entry"
    my_dict['target'] = "Manual Entry"
    my_dict['collimator'] = my_pca_dict['Xray']['collimation']
    # Specific to Freya
    my_dict['Trawl number or collection event ID'] = "Manual Entry"
    my_dict['same specimen as Scan x'] = "Manual Entry"
    my_dict['specimen pixel width'] = "Manual Entry"
    # additional meta data
    my_dict['general system name'] = my_pca_dict['General']['systemname']
    return my_dict


def _write_record(df, file_name_xlsx):
    """Append the one-row frame ``df`` to the workbook, creating it when absent."""
    if pathlib.Path(file_name_xlsx).is_file():
        wb = load_workbook(filename=file_name_xlsx)
        ws = wb["Sheet1"]
        # No index and no column headers: the existing sheet already has them.
        for r in dataframe_to_rows(df, index=False, header=False):
            ws.append(r)
        wb.save(file_name_xlsx)
        print('Append to existing meta-data excel file at: %s' % file_name_xlsx)
    else:
        df.to_excel(file_name_xlsx, header=True, index=False)
        print('Create a new meta-data excel file at: %s' % file_name_xlsx)


def main(args):
    """Log the meta data of one or more run-file folders to ``master.xlsx``.

    For every folder ``<dir>/<name>`` given on the command line the files
    ``<name>.pca``, ``<name>.pcp``, ``<name>.pcr`` and ``<name>.dtxml``
    inside it are read, and one row is appended to ``master.xlsx`` located
    in the folder's parent directory (the workbook is created when it does
    not exist yet).  Arguments that are not existing directories are
    reported and skipped.

    Parameters
    ----------
    args : sequence of str
        Command-line vector in ``sys.argv`` layout: ``args[0]`` is the
        program name, the remaining items are run-file folder paths.

    Raises
    ------
    SystemExit
        With status 1 when no folder path is supplied.
    """
    # Use the vector that was passed in rather than the global sys.argv, so
    # the function honours its own parameter.
    if len(args) < 2:
        print ('ERROR: Must provide the path to a run-file folder as the argument')
        print ('Example:')
        print (' python ge.py /Users/decarlo/conda/nocturn/data/FEG230530_413/')
        sys.exit(1)

    for arg in args[1:]:
        print('Looking into %s' % arg)
        folder = pathlib.Path(arg)
        if not folder.is_dir():
            print('ERROR: %s does not exist' % folder)
            continue

        # Base path of the run files: <folder>/<folder stem>, to which the
        # individual extensions are attached.
        p = folder.joinpath(folder.stem)
        file_name_xlsx = p.parents[1].joinpath('master').with_suffix('.xlsx')

        record = _collect_scan_record(p)
        df = pd.DataFrame(data=record, index=[0])
        _write_record(df, file_name_xlsx)
# Script entry point: hand the raw command-line vector to main().
if __name__ == "__main__":
    main(sys.argv)