Updated on July 31, 2022
This Python script uses the tifffile and imagecodecs packages to create a fsspec ReferenceFileSystem file in JSON format for the Earthbigdata set, which consists of 1,033,422 GeoTIFF files stored on AWS. The ReferenceFileSystem is used to create a multi-dimensional Xarray dataset.
See discussion at kerchunk/issues/78.
import os
import base64
import tifffile
import imagecodecs.numcodecs
import matplotlib.pyplot
import numcodecs
import fsspec
import xarray
import zarr
Call the AWS command line app to recursively list all files in the Earthbigdata set. Cache the output in a local file. Filter the list for TIFF files and remove the common path.
if not os.path.exists('earthbigdata.txt'):
os.system(
'aws s3 ls sentinel-1-global-coherence-earthbigdata/data/tiles'
' --recursive > earthbigdata.txt'
)
with open('earthbigdata.txt') as fh:
tiff_files = [
line.split()[-1][11:] for line in fh.readlines() if '.tif' in line
]
print('Number of TIFF files:', len(tiff_files))
Number of TIFF files: 1033422
Define labels, coordinate arrays, file name regular expression patterns, and categories for all dimensions in the Earthbigdata set.
baseurl = (
'https://'
'sentinel-1-global-coherence-earthbigdata.s3.us-west-2.amazonaws.com'
'/data/tiles/'
)
chunkshape = (1200, 1200)
fillvalue = 0
latitude_label = 'latitude'
latitude_pattern = rf'(?P<{latitude_label}>[NS]\d+)'
latitude_coordinates = [
(j * -0.00083333333 - 0.000416666665 + i)
for i in range(82, -79, -1)
for j in range(1200)
]
latitude_category = {}
i = 0
for j in range(82, -1, -1):
latitude_category[f'N{j:-02}'] = i
i += 1
for j in range(1, 79):
latitude_category[f'S{j:-02}'] = i
i += 1
longitude_label = 'longitude'
longitude_pattern = rf'(?P<{longitude_label}>[EW]\d+)'
longitude_coordinates = [
(j * 0.00083333333 + 0.000416666665 + i)
for i in range(-180, 180)
for j in range(1200)
]
longitude_category = {}
i = 0
for j in range(180, 0, -1):
longitude_category[f'W{j:-03}'] = i
i += 1
for j in range(180):
longitude_category[f'E{j:-03}'] = i
i += 1
season_label = 'season'
season_category = {'winter': 0, 'spring': 1, 'summer': 2, 'fall': 3}
season_coordinates = list(season_category.keys())
season_pattern = rf'_(?P<{season_label}>{"|".join(season_category)})'
polarization_label = 'polarization'
polarization_category = {'vv': 0, 'vh': 1, 'hv': 2, 'hh': 3}
polarization_coordinates = list(polarization_category.keys())
polarization_pattern = (
rf'_(?P<{polarization_label}>{"|".join(polarization_category)})'
)
coherence_label = 'coherence'
coherence_category = {
'06': 0,
'12': 1,
'18': 2,
'24': 3,
'36': 4,
'48': 5,
}
coherence_coordinates = list(int(i) for i in coherence_category.keys())
coherence_pattern = (
rf'_COH(?P<{coherence_label}>{"|".join(coherence_category)})'
)
orbit_label = 'orbit'
orbit_coordinates = list(range(1, 176))
orbit_pattern = rf'_(?P<{orbit_label}>\d+)'
flightdirection_label = 'flightdirection'
flightdirection_category = {'A': 0, 'D': 1}
flightdirection_coordinates = list(flightdirection_category.keys())
flightdirection_pattern = (
rf'(?P<{flightdirection_label}>[{"|".join(flightdirection_category)}])_'
)
jsonfile = open('earthbigdata.json', 'w', newline='\n')
Add the coordinate arrays to a Zarr group, convert it to a fsspec ReferenceFileSystem JSON string, and write it to the open file.
coordinates = {} # type: ignore
zarrgroup = zarr.open_group(coordinates)
zarrgroup.array(
longitude_label,
data=longitude_coordinates,
dtype='float32',
# compression='zlib',
).attrs['_ARRAY_DIMENSIONS'] = [longitude_label]
zarrgroup.array(
latitude_label,
data=latitude_coordinates,
dtype='float32',
# compression='zlib',
).attrs['_ARRAY_DIMENSIONS'] = [latitude_label]
zarrgroup.array(
season_label,
data=season_coordinates,
dtype=object,
object_codec=numcodecs.VLenUTF8(),
compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [season_label]
zarrgroup.array(
polarization_label,
data=polarization_coordinates,
dtype=object,
object_codec=numcodecs.VLenUTF8(),
compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [polarization_label]
zarrgroup.array(
coherence_label,
data=coherence_coordinates,
dtype='uint8',
compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [coherence_label]
zarrgroup.array(orbit_label, data=orbit_coordinates, dtype='int32').attrs[
'_ARRAY_DIMENSIONS'
] = [orbit_label]
zarrgroup.array(
flightdirection_label,
data=flightdirection_coordinates,
dtype=object,
object_codec=numcodecs.VLenUTF8(),
compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [flightdirection_label]
# base64 encode any values containing non-ascii characters
for k, v in coordinates.items():
try:
coordinates[k] = v.decode()
except UnicodeDecodeError:
coordinates[k] = 'base64:' + base64.b64encode(v).decode()
coordinates_json = tifffile.ZarrStore._json(coordinates).decode()
jsonfile.write(coordinates_json[:-2]) # skip the last newline and brace
69839
Filter the list of GeoTIFF files for files containing coherence 'COH' data. The regular expression pattern and categories are used to parse the file names for chunk indices.
Note: the created TiffSequence cannot be used to access any files. The file
names do not refer to existing files. The baseurl
is later used to get
the real location of the files.
mode = 'COH'
fileseq = tifffile.TiffSequence(
[file for file in tiff_files if '_' + mode in file],
pattern=(
latitude_pattern
+ longitude_pattern
+ season_pattern
+ polarization_pattern
+ coherence_pattern
),
categories={
latitude_label: latitude_category,
longitude_label: longitude_category,
season_label: season_category,
polarization_label: polarization_category,
coherence_label: coherence_category,
},
)
assert len(fileseq.files) == 444821
assert fileseq.files_missing == 5119339
assert fileseq.shape == (161, 360, 4, 4, 6)
assert fileseq.dims == (
'latitude',
'longitude',
'season',
'polarization',
'coherence',
)
print(fileseq)
TiffSequence N00E005_fall_vv_COH12.tif files: 444821 (5119339 missing) shape: 161, 360, 4, 4, 6 dims: latitude, longitude, season, polarization, coherence
Define axestiled
to tile the latitude and longitude dimensions of the
TiffSequence with the first and second image/chunk dimensions.
Define extra zattrs
to create a Xarray compatible store.
store = fileseq.aszarr(
dtype='uint8',
chunkshape=chunkshape,
fillvalue=fillvalue,
axestiled={0: 0, 1: 1},
zattrs={
'_ARRAY_DIMENSIONS': [
season_label,
polarization_label,
coherence_label,
latitude_label,
longitude_label,
]
},
)
print(store)
ZarrFileSequenceStore shape: 4, 4, 6, 193200, 432000 chunks: 1, 1, 1, 1200, 1200 dtype: uint8 fillvalue: 0
Use the mode name to create a Zarr subgroup.
Use the imagecodecs_tiff
Numcodecs compatible codec for decoding TIFF files.
store.write_fsspec(
jsonfile,
baseurl,
groupname=mode,
codec_id='imagecodecs_tiff',
_append=True,
_close=False,
)
Repeat the TiffSequence->aszarr->write_fsspec
workflow for the other modes.
for mode in (
'AMP',
'tau',
'rmse',
'rho',
):
fileseq = tifffile.TiffSequence(
[file for file in tiff_files if '_' + mode in file],
pattern=(
latitude_pattern
+ longitude_pattern
+ season_pattern
+ polarization_pattern
),
categories={
latitude_label: latitude_category,
longitude_label: longitude_category,
season_label: season_category,
polarization_label: polarization_category,
},
)
print(fileseq)
with fileseq.aszarr(
dtype='uint16',
chunkshape=chunkshape,
fillvalue=fillvalue,
axestiled={0: 0, 1: 1},
zattrs={
'_ARRAY_DIMENSIONS': [
season_label,
polarization_label,
latitude_label,
longitude_label,
]
},
) as store:
print(store)
store.write_fsspec(
jsonfile,
baseurl,
groupname=mode,
codec_id='imagecodecs_tiff',
_append=True,
_close=False,
)
for mode in ('inc', 'lsmap'):
fileseq = tifffile.TiffSequence(
[file for file in tiff_files if '_' + mode in file],
pattern=(
latitude_pattern
+ longitude_pattern
+ orbit_pattern
+ flightdirection_pattern
),
categories={
latitude_label: latitude_category,
longitude_label: longitude_category,
# orbit has no category
flightdirection_label: flightdirection_category,
},
)
print(fileseq)
with fileseq.aszarr(
dtype='uint8',
chunkshape=chunkshape,
fillvalue=fillvalue,
axestiled={0: 0, 1: 1},
zattrs={
'_ARRAY_DIMENSIONS': [
orbit_label,
flightdirection_label,
latitude_label,
longitude_label,
]
},
) as store:
print(store)
store.write_fsspec(
jsonfile,
baseurl,
groupname=mode,
codec_id='imagecodecs_tiff',
_append=True,
_close=mode == 'lsmap', # close after last store
)
TiffSequence N00E005_fall_vh_AMP.tif files: 187693 (739667 missing) shape: 161, 360, 4, 4 dims: latitude, longitude, season, polarization ZarrFileSequenceStore shape: 4, 4, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint16 fillvalue: 0 TiffSequence N00E005_fall_vv_tau.tif files: 98331 (829029 missing) shape: 161, 360, 4, 4 dims: latitude, longitude, season, polarization ZarrFileSequenceStore shape: 4, 4, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint16 fillvalue: 0 TiffSequence N00E005_fall_vv_rmse.tif files: 98331 (829029 missing) shape: 161, 360, 4, 4 dims: latitude, longitude, season, polarization ZarrFileSequenceStore shape: 4, 4, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint16 fillvalue: 0 TiffSequence N00E005_fall_vv_rho.tif files: 98331 (829029 missing) shape: 161, 360, 4, 4 dims: latitude, longitude, season, polarization ZarrFileSequenceStore shape: 4, 4, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint16 fillvalue: 0 TiffSequence N00E005_124D_inc.tif files: 52956 (20233044 missing) shape: 161, 360, 175, 2 dims: latitude, longitude, orbit, flightdirection ZarrFileSequenceStore shape: 175, 2, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint8 fillvalue: 0 TiffSequence N00E005_124D_lsmap.tif files: 52959 (20233041 missing) shape: 161, 360, 175, 2 dims: latitude, longitude, orbit, flightdirection ZarrFileSequenceStore shape: 175, 2, 193200, 432000 chunks: 1, 1, 1200, 1200 dtype: uint8 fillvalue: 0
jsonfile.close()
Register imagecodecs.numcodecs
before using the ReferenceFileSystem.
imagecodecs.numcodecs.register_codecs()
Specify the target_protocol
to load a local file.
mapper = fsspec.get_mapper(
'reference://',
fo='earthbigdata.json',
target_protocol='file',
remote_protocol='https',
)
Use mask_and_scale
to disable conversion to floating point.
dataset = xarray.open_dataset(
mapper,
engine='zarr',
mask_and_scale=False,
backend_kwargs={'consolidated': False},
)
print(dataset)
<xarray.Dataset> Dimensions: (season: 4, polarization: 4, latitude: 193200, longitude: 432000, coherence: 6, flightdirection: 2, orbit: 175) Coordinates: * coherence (coherence) uint8 6 12 18 24 36 48 * flightdirection (flightdirection) object 'A' 'D' * latitude (latitude) float32 82.0 82.0 82.0 ... -79.0 -79.0 -79.0 * longitude (longitude) float32 -180.0 -180.0 -180.0 ... 180.0 180.0 * orbit (orbit) int32 1 2 3 4 5 6 7 ... 169 170 171 172 173 174 175 * polarization (polarization) object 'vv' 'vh' 'hv' 'hh' * season (season) object 'winter' 'spring' 'summer' 'fall' Data variables: AMP (season, polarization, latitude, longitude) uint16 ... COH (season, polarization, coherence, latitude, longitude) uint8 ... inc (orbit, flightdirection, latitude, longitude) uint8 ... lsmap (orbit, flightdirection, latitude, longitude) uint8 ... rho (season, polarization, latitude, longitude) uint16 ... rmse (season, polarization, latitude, longitude) uint16 ... tau (season, polarization, latitude, longitude) uint16 ...
socal = dataset.sel(latitude=slice(36, 32.5), longitude=slice(-121, -115))
print(socal)
<xarray.Dataset> Dimensions: (season: 4, polarization: 4, latitude: 4200, longitude: 7200, coherence: 6, flightdirection: 2, orbit: 175) Coordinates: * coherence (coherence) uint8 6 12 18 24 36 48 * flightdirection (flightdirection) object 'A' 'D' * latitude (latitude) float32 36.0 36.0 36.0 36.0 ... 32.5 32.5 32.5 * longitude (longitude) float32 -121.0 -121.0 -121.0 ... -115.0 -115.0 * orbit (orbit) int32 1 2 3 4 5 6 7 ... 169 170 171 172 173 174 175 * polarization (polarization) object 'vv' 'vh' 'hv' 'hh' * season (season) object 'winter' 'spring' 'summer' 'fall' Data variables: AMP (season, polarization, latitude, longitude) uint16 ... COH (season, polarization, coherence, latitude, longitude) uint8 ... inc (orbit, flightdirection, latitude, longitude) uint8 ... lsmap (orbit, flightdirection, latitude, longitude) uint8 ... rho (season, polarization, latitude, longitude) uint16 ... rmse (season, polarization, latitude, longitude) uint16 ... tau (season, polarization, latitude, longitude) uint16 ...
The few GeoTIFF files comprising the selection are transparently downloaded, decoded, and stitched to an in-memory NumPy array and plotted using Matplotlib.
image = socal['COH'].loc['winter', 'vv', 12]
assert image[100, 100] == 53
xarray.plot.imshow(image, size=6, aspect=1.8)
matplotlib.pyplot.show()