Commit ffe0232e authored by Christian Weymann

finish data loading block

parent f706632b
......@@ -9,7 +9,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
......@@ -34,132 +34,155 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 42,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The following measurements were converted previously and have been ignored:\n",
"m21003b_topo_MC_009\n",
"m21004DSOJ4_topo_MC_005\n",
"m21004DSOJ4_topo_MC_008\n",
"m21005b_topo_MC_006\n",
"m21005b_topo_MC_009\n",
"m21006DSOI2_topo_MC_005\n",
"m21007b_topo_MC_003\n",
"m20036_topo_MC_006\n",
"m20036_topo_MC_007\n",
"m20036_topo_MC_008\n",
"m20037_topo_MC_003\n",
"m20037_topo_MC_004\n",
"m20038_topo_MC_006\n",
"m20038_topo_MC_007\n",
"m20038_topo_MC_010\n",
"m20039_topo_MC_010\n",
"m20039_topo_MC_011\n",
"m20041_topo_MC_006\n",
"m20041_topo_MC_007\n",
"m21003DSOJ2_topo_MC_006\n",
"m21003DSOJ2_topo_MC_009\n",
"m21005DSOF2_topo_MC_005\n",
"m21005DSOF2_topo_MC_008\n",
"m21007DSOC3_topo_MC_005\n",
"\n",
"\n",
"The following files were skipped:\n",
"data\\to_convert\\\n",
"data\\to_convert\\old\n",
"If any of these files should have been included, check the allowed extensions.\n"
]
}
],
"source": [
"allowed_extensions = set(('.idw', '.sxm'))\n",
"raw_data_path = os.path.join('data', 'to_convert')\n",
"data_path = 'data'\n",
"skipped_list = []\n",
"already_converted = []\n",
"to_convert = {}\n",
"\n",
"for fn in glob(os.path.join(raw_data_path), '**', recursive=True):\n",
"for fn in glob(os.path.join(raw_data_path, '**'), recursive=True):\n",
" \n",
" sample_name = os.path.basename(fn).split('_')[0]\n",
" #skip empty filenames\n",
" if sample_name == '':\n",
" #skip anything that doesn't have an allowed extension\n",
" if os.path.splitext(fn)[1] not in allowed_extensions:\n",
" skipped_list.append(fn)\n",
" continue\n",
" #what to do when we can't convert the file at hand?\n",
" sample_path = os.path.join(data_path, sample_name+'.hdf5')\n",
" #check if the sample file exists and create it if not\n",
" if not os.path.isfile(sample_path):\n",
" #create sample file\n",
" #check if sample thickness is given\n",
" #check if the measurement is present in the sample file"
" \n",
" measurement_name = os.path.splitext(os.path.basename(fn))[0]\n",
" sample_name = measurement_name.split('_')[0]\n",
" combined_name = os.path.join(data_path, sample_name)\n",
" \n",
" #check if the sample file exists \n",
" if os.path.isfile(combined_name+'.hdf5'):\n",
" with h5py.File(combined_name+'.hdf5', 'r') as f:\n",
" #check if the measurement is already there\n",
" if f'datasets/{measurement_name}' in f:\n",
" already_converted.append(measurement_name)\n",
" continue\n",
" \n",
" #Since the measurement is not there, add it to the list to convert\n",
" to_convert.setdefault(combined_name, []).append(fn.replace('\\\\', '/')) \n",
" \n",
"for combined_name, filelist in to_convert.items():\n",
" ms.io.read_file.merge_hdf5(filelist, combined_name)\n",
"\n",
"for combined_name in to_convert:\n",
" sample_name = os.path.basename(combine_name)\n",
" with h5py.File(combined_name+'.hdf5', 'r+') as f:\n",
" #check if sample thickness is given, ask for it if not\n",
" if f['datasets'].attrs.get('Thickness_uc') is None:\n",
" try:\n",
" f['datasets'].attrs.create('Thickness_uc',\n",
" int(input(f'What is the thickness of {sample_name} in uc?')))\n",
" except ValueError:\n",
" pass\n",
" \n",
" \n",
"print('The following measurements were converted previously and have been ignored:')\n",
"for mn in already_converted: print(mn)\n",
"print('\\n')\n",
"\n",
"print('The following files were skipped:')\n",
"for fn in skipped_list: print(fn)\n",
"print('If any of these files should have been included, check the allowed extensions.')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'text.txt'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"data/Celine\n",
"data/LoopsCeline-color.png\n",
"data/m20036.hdf5\n",
"data/m20037.hdf5\n",
"data/m20038.hdf5\n",
"data/m20039.hdf5\n",
"data/m20041.hdf5\n",
"data/m21003b.hdf5\n",
"data/m21003DSOJ2.hdf5\n",
"data/m21004DSOJ4.hdf5\n",
"data/m21005b.hdf5\n",
"data/m21005b_topo_MC_006.sxm\n",
"data/m21005DSOF2.hdf5\n",
"data/m21006DSOI2.hdf5\n",
"data/m21007b.hdf5\n",
"data/m21007DSOC3.hdf5\n",
"data/test.txt\n",
"data/to_convert\n"
]
}
],
"source": [
"os.path.basename('/foo/bar/text.txt').split('_')[0]"
"for fn in glob('data/*'):\n",
" print(fn.replace('\\\\', '/'))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['data\\\\',\n",
" 'data\\\\Celine',\n",
" 'data\\\\Celine\\\\20201123 Zoom Meeting Ankit.pdf',\n",
" 'data\\\\Celine\\\\m20053',\n",
" 'data\\\\Celine\\\\m20053\\\\SSPFM53_01.ARDF',\n",
" 'data\\\\Celine\\\\m20053\\\\SSPFM53_01.hdf5',\n",
" 'data\\\\Celine\\\\m20053\\\\SSPFM_00.ARDF',\n",
" 'data\\\\Celine\\\\m20053\\\\SSPFM_00.hdf5',\n",
" 'data\\\\Celine\\\\m20053.zip',\n",
" 'data\\\\Celine\\\\m20054',\n",
" 'data\\\\Celine\\\\m20054\\\\SSPFM_00.ARDF',\n",
" 'data\\\\Celine\\\\m20054\\\\SSPFM_00.hdf5',\n",
" 'data\\\\Celine\\\\m20054.zip',\n",
" 'data\\\\Celine\\\\m20055',\n",
" 'data\\\\Celine\\\\m20055\\\\SSPFM55_00.ARDF',\n",
" 'data\\\\Celine\\\\m20055\\\\SSPFM55_00.hdf5',\n",
" 'data\\\\Celine\\\\m20055\\\\SSPFM55_01.ARDF',\n",
" 'data\\\\Celine\\\\m20055\\\\SSPFM55_01.hdf5',\n",
" 'data\\\\Celine\\\\m20055\\\\SSPFM_02.ARDF',\n",
" 'data\\\\Celine\\\\m20055\\\\SSPFM_02.hdf5',\n",
" 'data\\\\Celine\\\\m20055.zip',\n",
" 'data\\\\Celine\\\\m20056',\n",
" 'data\\\\Celine\\\\m20056\\\\SSPFM56_00.ARDF',\n",
" 'data\\\\Celine\\\\m20056\\\\SSPFM56_00.hdf5',\n",
" 'data\\\\Celine\\\\m20057',\n",
" 'data\\\\Celine\\\\m20057\\\\SSPFM57_00.ARDF',\n",
" 'data\\\\Celine\\\\m20057\\\\SSPFM57_00.hdf5',\n",
" 'data\\\\Celine\\\\m20058',\n",
" 'data\\\\Celine\\\\m20058\\\\SSPFM_00.ARDF',\n",
" 'data\\\\Celine\\\\m20058\\\\SSPFM_00.hdf5',\n",
" 'data\\\\Celine\\\\m20058\\\\SSPFM_01.ARDF',\n",
" 'data\\\\Celine\\\\m20058\\\\SSPFM_01.hdf5',\n",
" 'data\\\\Celine\\\\m20058.zip',\n",
" 'data\\\\Celine\\\\m20059',\n",
" 'data\\\\Celine\\\\m20059\\\\SSPFM59_00.ARDF',\n",
" 'data\\\\Celine\\\\m20059\\\\SSPFM59_00.hdf5',\n",
" 'data\\\\Celine\\\\m20059\\\\SSPFM59_01.ARDF',\n",
" 'data\\\\Celine\\\\m20059\\\\SSPFM59_01.hdf5',\n",
" 'data\\\\Celine\\\\m20059.zip',\n",
" 'data\\\\LoopsCeline-color.png',\n",
" 'data\\\\test.txt',\n",
" 'data\\\\to_convert',\n",
" 'data\\\\to_convert\\\\m21003b_topo_MC_009.sxm',\n",
" 'data\\\\to_convert\\\\m21004DSOJ4_topo_MC_005.sxm',\n",
" 'data\\\\to_convert\\\\m21004DSOJ4_topo_MC_008.sxm',\n",
" 'data\\\\to_convert\\\\m21005b_topo_MC_006.sxm',\n",
" 'data\\\\to_convert\\\\m21005b_topo_MC_009.sxm',\n",
" 'data\\\\to_convert\\\\m21006DSOI2_topo_MC_005.sxm',\n",
" 'data\\\\to_convert\\\\m21007b_topo_MC_003.sxm',\n",
" 'data\\\\to_convert\\\\old',\n",
" 'data\\\\to_convert\\\\old\\\\m20036_topo_MC_006.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m20036_topo_MC_007.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m20036_topo_MC_008.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m20037_topo_MC_003.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m20037_topo_MC_004.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m20038_topo_MC_006.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m20038_topo_MC_007.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m20038_topo_MC_010.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m20039_topo_MC_010.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m20039_topo_MC_011.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m20041_topo_MC_006.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m20041_topo_MC_007.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m21003DSOJ2_topo_MC_006.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m21003DSOJ2_topo_MC_009.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m21005DSOF2_topo_MC_005.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m21005DSOF2_topo_MC_008.sxm',\n",
" 'data\\\\to_convert\\\\old\\\\m21007DSOC3_topo_MC_005.sxm']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"m20038_topo_MC_006\n",
"m20038_topo_MC_007\n",
"m20038_topo_MC_010\n"
]
}
],
"source": [
"glob('data/**', recursive=True)"
"with h5py.File('data/m20038.hdf5', 'r') as f:\n",
" for key in f['datasets']: print(key)\n",
" "
]
},
{
......
%% Cell type:markdown id: tags:
General-purpose imports
%% Cell type:code id: tags:
``` python
import numpy as np
import matplotlib.pyplot as plt
import multiscale as ms
import h5py
import os
from glob import glob
from IPython.display import clear_output
```
%% Cell type:markdown id: tags:
Next, we load all the data in `raw_data_path` into the `.hdf5` format and store it in `data_path`. We create one `.hdf5` file per sample, assuming that the raw data filenames start with the sample name followed by an underscore; the short check below illustrates that convention.
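%% Cell type:markdown id: tags:
Before the conversion loop, here is how a sample name is parsed from a raw filename (a minimal sketch; the path is just an illustrative example taken from the listings further down):
%% Cell type:code id: tags:
``` python
import os

# Strip the directory and extension, then take everything before the
# first underscore as the sample name.
fn = 'data/to_convert/m21005b_topo_MC_006.sxm'  # illustrative path
measurement_name = os.path.splitext(os.path.basename(fn))[0]
sample_name = measurement_name.split('_')[0]
print(measurement_name)  # m21005b_topo_MC_006
print(sample_name)       # m21005b
```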
%% Cell type:code id: tags:
``` python
allowed_extensions = set(('.idw', '.sxm'))
raw_data_path = os.path.join('data', 'to_convert')
data_path = 'data'
skipped_list = []
already_converted = []
to_convert = {}

for fn in glob(os.path.join(raw_data_path, '**'), recursive=True):

    #skip anything that doesn't have an allowed extension
    if os.path.splitext(fn)[1] not in allowed_extensions:
        skipped_list.append(fn)
        continue

    measurement_name = os.path.splitext(os.path.basename(fn))[0]
    sample_name = measurement_name.split('_')[0]
    combined_name = os.path.join(data_path, sample_name)

    #check if the sample file exists
    if os.path.isfile(combined_name+'.hdf5'):
        with h5py.File(combined_name+'.hdf5', 'r') as f:
            #check if the measurement is already there
            if f'datasets/{measurement_name}' in f:
                already_converted.append(measurement_name)
                continue

    #since the measurement is not there, add it to the list to convert
    to_convert.setdefault(combined_name, []).append(fn.replace('\\', '/'))

for combined_name, filelist in to_convert.items():
    ms.io.read_file.merge_hdf5(filelist, combined_name)

for combined_name in to_convert:
    sample_name = os.path.basename(combined_name)
    with h5py.File(combined_name+'.hdf5', 'r+') as f:
        #check if sample thickness is given, ask for it if not
        if f['datasets'].attrs.get('Thickness_uc') is None:
            try:
                f['datasets'].attrs.create('Thickness_uc',
                                           int(input(f'What is the thickness of {sample_name} in uc?')))
            except ValueError:
                pass

print('The following measurements were converted previously and have been ignored:')
for mn in already_converted: print(mn)
print('\n')

print('The following files were skipped:')
for fn in skipped_list: print(fn)
print('If any of these files should have been included, check the allowed extensions.')
```
%%%% Output: stream
The following measurements were converted previously and have been ignored:
m21003b_topo_MC_009
m21004DSOJ4_topo_MC_005
m21004DSOJ4_topo_MC_008
m21005b_topo_MC_006
m21005b_topo_MC_009
m21006DSOI2_topo_MC_005
m21007b_topo_MC_003
m20036_topo_MC_006
m20036_topo_MC_007
m20036_topo_MC_008
m20037_topo_MC_003
m20037_topo_MC_004
m20038_topo_MC_006
m20038_topo_MC_007
m20038_topo_MC_010
m20039_topo_MC_010
m20039_topo_MC_011
m20041_topo_MC_006
m20041_topo_MC_007
m21003DSOJ2_topo_MC_006
m21003DSOJ2_topo_MC_009
m21005DSOF2_topo_MC_005
m21005DSOF2_topo_MC_008
m21007DSOC3_topo_MC_005
The following files were skipped:
data\to_convert\
data\to_convert\old
If any of these files should have been included, check the allowed extensions.
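%% Cell type:markdown id: tags:
To spot-check a conversion by hand, we can reopen one of the sample files read-only (a minimal sketch, assuming `m20038.hdf5` and the measurement name from the listing below):
%% Cell type:code id: tags:
``` python
import h5py

# Membership tests work on HDF5 paths, and attrs.get returns None
# for attributes that were never set.
with h5py.File('data/m20038.hdf5', 'r') as f:
    print('datasets/m20038_topo_MC_006' in f)       # True once converted
    print(f['datasets'].attrs.get('Thickness_uc'))  # None if never entered
```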
%% Cell type:code id: tags:
``` python
for fn in glob('data/*'):
    print(fn.replace('\\', '/'))
```
%%%% Output: stream
data/Celine
data/LoopsCeline-color.png
data/m20036.hdf5
data/m20037.hdf5
data/m20038.hdf5
data/m20039.hdf5
data/m20041.hdf5
data/m21003b.hdf5
data/m21003DSOJ2.hdf5
data/m21004DSOJ4.hdf5
data/m21005b.hdf5
data/m21005b_topo_MC_006.sxm
data/m21005DSOF2.hdf5
data/m21006DSOI2.hdf5
data/m21007b.hdf5
data/m21007DSOC3.hdf5
data/test.txt
data/to_convert
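%% Cell type:markdown id: tags:
Note that `'data/*'` only lists the top level; the conversion cell needs both the `'**'` pattern and `recursive=True` to walk into subfolders such as `to_convert/old` (a quick illustration of standard `glob` behaviour):
%% Cell type:code id: tags:
``` python
from glob import glob

# Non-recursive: immediate children of data/ only.
print(len(glob('data/*')))
# Recursive: data/ itself plus every file and folder below it.
print(len(glob('data/**', recursive=True)))
```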
%% Cell type:code id: tags:
``` python
with h5py.File('data/m20038.hdf5', 'r') as f:
    for key in f['datasets']: print(key)
```
%%%% Output: stream
m20038_topo_MC_006
m20038_topo_MC_007
m20038_topo_MC_010
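%% Cell type:markdown id: tags:
To see the full hierarchy instead of just the top-level keys of `datasets`, `h5py` can walk the whole file (a minimal sketch):
%% Cell type:code id: tags:
``` python
import h5py

# visit() calls the given function with the path of every group and
# dataset in the file, relative to the root.
with h5py.File('data/m20038.hdf5', 'r') as f:
    f.visit(print)
```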
%% Cell type:code id: tags:
``` python
```
......