Create a fsspec ReferenceFileSystem for a large set of remote GeoTIFF files¶
Published Oct 9, 2021. Last updated May 21, 2025.
This Python script uses the tifffile and imagecodecs packages to create a fsspec ReferenceFileSystem file in JSON format for the Earthbigdata set, which consists of 1,033,422 GeoTIFF files stored on AWS. The ReferenceFileSystem is used to create a multi-dimensional Xarray dataset.
Refer to the discussion at kerchunk/issues/78.
Source code is available in the tifffile repository.
import base64
import os
import imagecodecs.numcodecs
import kerchunk.utils
import matplotlib.pyplot
import tifffile
import tifffile.zarr
import xarray
import zarr # >= 3
Get a list of all remote TIFF files¶
Call the AWS command line app to recursively list all files in the Earthbigdata set. Cache the output in a local file. Filter the list for TIFF files and remove the common path.
if not os.path.exists('earthbigdata.txt'):
os.system(
'aws s3 ls sentinel-1-global-coherence-earthbigdata/data/tiles'
' --recursive > earthbigdata.txt'
)
with open('earthbigdata.txt', encoding='utf-8') as fh:
tiff_files = [
line.split()[-1][11:] for line in fh.readlines() if '.tif' in line
]
print('Number of TIFF files:', len(tiff_files))
Number of TIFF files: 1033422
Define metadata to describe the dataset¶
Define labels, coordinate arrays, file name regular expression patterns, and categories for all dimensions in the Earthbigdata set.
baseurl = (
'https://'
'sentinel-1-global-coherence-earthbigdata.s3.us-west-2.amazonaws.com'
'/data/tiles/'
)
chunkshape = (1200, 1200)
fillvalue = 0
latitude_label = 'latitude'
latitude_pattern = rf'(?P<{latitude_label}>[NS]\d+)'
latitude_coordinates = [
(j * -0.00083333333 - 0.000416666665 + i)
for i in range(82, -79, -1)
for j in range(1200)
]
latitude_category = {}
i = 0
for j in range(82, -1, -1):
latitude_category[f'N{j:-02}'] = i
i += 1
for j in range(1, 79):
latitude_category[f'S{j:-02}'] = i
i += 1
longitude_label = 'longitude'
longitude_pattern = rf'(?P<{longitude_label}>[EW]\d+)'
longitude_coordinates = [
(j * 0.00083333333 + 0.000416666665 + i)
for i in range(-180, 180)
for j in range(1200)
]
longitude_category = {}
i = 0
for j in range(180, 0, -1):
longitude_category[f'W{j:-03}'] = i
i += 1
for j in range(180):
longitude_category[f'E{j:-03}'] = i
i += 1
season_label = 'season'
season_category = {'winter': 0, 'spring': 1, 'summer': 2, 'fall': 3}
season_coordinates = list(season_category.keys())
season_pattern = rf'_(?P<{season_label}>{"|".join(season_category)})'
polarization_label = 'polarization'
polarization_category = {'vv': 0, 'vh': 1, 'hv': 2, 'hh': 3}
polarization_coordinates = list(polarization_category.keys())
polarization_pattern = (
rf'_(?P<{polarization_label}>{"|".join(polarization_category)})'
)
coherence_label = 'coherence'
coherence_category = {
'06': 0,
'12': 1,
'18': 2,
'24': 3,
'36': 4,
'48': 5,
}
coherence_coordinates = list(int(i) for i in coherence_category.keys())
coherence_pattern = (
rf'_COH(?P<{coherence_label}>{"|".join(coherence_category)})'
)
orbit_label = 'orbit'
orbit_coordinates = list(range(1, 176))
orbit_pattern = rf'_(?P<{orbit_label}>\d+)'
flightdirection_label = 'flightdirection'
flightdirection_category = {'A': 0, 'D': 1}
flightdirection_coordinates = list(flightdirection_category.keys())
flightdirection_pattern = (
rf'(?P<{flightdirection_label}>[{"|".join(flightdirection_category)}])_'
)
Open a file for writing the fsspec ReferenceFileSystem in JSON format¶
jsonfile = open('earthbigdata.json', 'w', encoding='utf-8', newline='\n')
Write the coordinate arrays¶
Add the coordinate arrays to a Zarr group, convert it to a fsspec ReferenceFileSystem JSON string, and write it to the open file.
coordinates = {} # type: ignore[var-annotated]
zarrgroup = zarr.open_group(coordinates, use_consolidated=False, zarr_format=2)
zararray = zarrgroup.create_array(
longitude_label,
shape=[len(longitude_coordinates)],
dtype='float32',
# compressor='zlib',
)
zararray[:] = longitude_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [longitude_label]
zararray = zarrgroup.create_array(
latitude_label,
shape=[len(latitude_coordinates)],
dtype='float32',
# compressor='zlib',
)
zararray[:] = latitude_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [latitude_label]
zararray = zarrgroup.create_array(
season_label,
shape=[len(season_coordinates)],
dtype='|U6',
compressor=None,
)
zararray[:] = season_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [season_label]
zararray = zarrgroup.create_array(
polarization_label,
shape=[len(polarization_coordinates)],
dtype='|U2',
compressor=None,
)
zararray[:] = polarization_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [polarization_label]
zararray = zarrgroup.create_array(
coherence_label,
shape=[len(coherence_coordinates)],
dtype='uint8',
compressor=None,
)
zararray[:] = coherence_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [coherence_label]
zararray = zarrgroup.create_array(
orbit_label, shape=[len(orbit_coordinates)], dtype='int32'
)
zararray[:] = orbit_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [orbit_label]
zararray = zarrgroup.create_array(
flightdirection_label,
shape=[len(flightdirection_coordinates)],
dtype='|U1',
compressor=None,
)
zararray[:] = flightdirection_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [flightdirection_label]
# base64 encode any values containing non-ascii characters
for k, v in coordinates.items():
v = v.to_bytes()
try:
coordinates[k] = v.decode()
except UnicodeDecodeError:
coordinates[k] = 'base64:' + base64.b64encode(v).decode()
coordinates_json = tifffile.zarr._json_dumps(coordinates).decode()
jsonfile.write(coordinates_json[:-2]) # skip the last newline and brace
69746
Create a TiffSequence from a list of file names¶
Filter the list of GeoTIFF files for files containing coherence 'COH' data. The regular expression pattern and categories are used to parse the file names for chunk indices.
Note: the created TiffSequence cannot be used to access any files. The file
names do not refer to existing files. The baseurl
is later used to get
the real location of the files.
mode = 'COH'
fileseq = tifffile.TiffSequence(
[file for file in tiff_files if '_' + mode in file],
pattern=(
latitude_pattern
+ longitude_pattern
+ season_pattern
+ polarization_pattern
+ coherence_pattern
),
categories={
latitude_label: latitude_category,
longitude_label: longitude_category,
season_label: season_category,
polarization_label: polarization_category,
coherence_label: coherence_category,
},
)
assert len(fileseq) == 444821
assert fileseq.files_missing == 5119339
assert fileseq.shape == (161, 360, 4, 4, 6)
assert fileseq.dims == (
'latitude',
'longitude',
'season',
'polarization',
'coherence',
)
print(fileseq)
TiffSequence N00E005_fall_vv_COH12.tif files: 444821 (5119339 missing) shape: 161, 360, 4, 4, 6 dims: latitude, longitude, season, polarization, coherence
Create a ZarrTiffStore from the TiffSequence¶
Define axestiled
to tile the latitude and longitude dimensions of the
TiffSequence with the first and second image/chunk dimensions.
Define extra zattrs
to create a Xarray compatible store.
store = fileseq.aszarr(
chunkdtype='uint8',
chunkshape=chunkshape,
fillvalue=fillvalue,
axestiled={0: 0, 1: 1},
zattrs={
'_ARRAY_DIMENSIONS': [
season_label,
polarization_label,
coherence_label,
latitude_label,
longitude_label,
]
},
)
print(store)
ZarrFileSequenceStore
Append the ZarrTiffStore to the open ReferenceFileSystem file¶
Use the mode name to create a Zarr subgroup.
Use the imagecodecs_tiff
Numcodecs compatible codec for decoding TIFF files.
store.write_fsspec(
jsonfile,
baseurl,
groupname=mode,
codec_id='imagecodecs_tiff',
_append=True,
_close=False,
)
Repeat for the other modes¶
Repeat the TiffSequence->aszarr->write_fsspec
workflow for the other modes.
for mode in (
'AMP',
'tau',
'rmse',
'rho',
):
fileseq = tifffile.TiffSequence(
[file for file in tiff_files if '_' + mode in file],
pattern=(
latitude_pattern
+ longitude_pattern
+ season_pattern
+ polarization_pattern
),
categories={
latitude_label: latitude_category,
longitude_label: longitude_category,
season_label: season_category,
polarization_label: polarization_category,
},
)
print(fileseq)
with fileseq.aszarr(
chunkdtype='uint16',
chunkshape=chunkshape,
fillvalue=fillvalue,
axestiled={0: 0, 1: 1},
zattrs={
'_ARRAY_DIMENSIONS': [
season_label,
polarization_label,
latitude_label,
longitude_label,
]
},
) as store:
print(store)
store.write_fsspec(
jsonfile,
baseurl,
groupname=mode,
codec_id='imagecodecs_tiff',
_append=True,
_close=False,
)
for mode in ('inc', 'lsmap'):
fileseq = tifffile.TiffSequence(
[file for file in tiff_files if '_' + mode in file],
pattern=(
latitude_pattern
+ longitude_pattern
+ orbit_pattern
+ flightdirection_pattern
),
categories={
latitude_label: latitude_category,
longitude_label: longitude_category,
# orbit has no category
flightdirection_label: flightdirection_category,
},
)
print(fileseq)
with fileseq.aszarr(
chunkdtype='uint8',
chunkshape=chunkshape,
fillvalue=fillvalue,
axestiled={0: 0, 1: 1},
zattrs={
'_ARRAY_DIMENSIONS': [
orbit_label,
flightdirection_label,
latitude_label,
longitude_label,
]
},
) as store:
print(store)
store.write_fsspec(
jsonfile,
baseurl,
groupname=mode,
codec_id='imagecodecs_tiff',
_append=True,
_close=mode == 'lsmap', # close after last store
)
TiffSequence N00E005_fall_vh_AMP.tif files: 187693 (739667 missing) shape: 161, 360, 4, 4 dims: latitude, longitude, season, polarization
ZarrFileSequenceStore
TiffSequence N00E005_fall_vv_tau.tif files: 98331 (829029 missing) shape: 161, 360, 4, 4 dims: latitude, longitude, season, polarization ZarrFileSequenceStore
TiffSequence N00E005_fall_vv_rmse.tif files: 98331 (829029 missing) shape: 161, 360, 4, 4 dims: latitude, longitude, season, polarization ZarrFileSequenceStore
TiffSequence N00E005_fall_vv_rho.tif files: 98331 (829029 missing) shape: 161, 360, 4, 4 dims: latitude, longitude, season, polarization
ZarrFileSequenceStore
TiffSequence N00E005_124D_inc.tif files: 52956 (20233044 missing) shape: 161, 360, 175, 2 dims: latitude, longitude, orbit, flightdirection ZarrFileSequenceStore
TiffSequence N00E005_124D_lsmap.tif files: 52959 (20233041 missing) shape: 161, 360, 175, 2 dims: latitude, longitude, orbit, flightdirection ZarrFileSequenceStore
Close the JSON file¶
jsonfile.close()
Use the fsspec ReferenceFileSystem file to create a Xarray dataset¶
Register imagecodecs.numcodecs
before using the ReferenceFileSystem.
imagecodecs.numcodecs.register_codecs()
Create a Xarray dataset from the ReferenceFileSystem file¶
Use mask_and_scale
to disable conversion to floating point.
store = kerchunk.utils.refs_as_store('earthbigdata.json')
dataset = xarray.open_zarr(store, consolidated=False, mask_and_scale=False)
print(dataset)
<xarray.Dataset> Size: 77TB Dimensions: (coherence: 6, flightdirection: 2, latitude: 193200, longitude: 432000, orbit: 175, polarization: 4, season: 4) Coordinates: * coherence (coherence) uint8 6B 6 12 18 24 36 48 * flightdirection (flightdirection) <U1 8B 'A' 'D' * latitude (latitude) float32 773kB 82.0 82.0 82.0 ... -79.0 -79.0 * longitude (longitude) float32 2MB -180.0 -180.0 ... 180.0 180.0 * orbit (orbit) int32 700B 1 2 3 4 5 6 ... 170 171 172 173 174 175 * polarization (polarization) <U2 32B 'vv' 'vh' 'hv' 'hh' * season (season) <U6 96B 'winter' 'spring' 'summer' 'fall' Data variables: COH (season, polarization, coherence, latitude, longitude) uint8 8TB dask.array<chunksize=(1, 1, 1, 1200, 1200), meta=np.ndarray> AMP (season, polarization, latitude, longitude) uint16 3TB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray> tau (season, polarization, latitude, longitude) uint16 3TB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray> rmse (season, polarization, latitude, longitude) uint16 3TB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray> rho (season, polarization, latitude, longitude) uint16 3TB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray> inc (orbit, flightdirection, latitude, longitude) uint8 29TB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray> lsmap (orbit, flightdirection, latitude, longitude) uint8 29TB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>
Select the Southern California region in the dataset¶
socal = dataset.sel(latitude=slice(36, 32.5), longitude=slice(-121, -115))
print(socal)
<xarray.Dataset> Size: 28GB Dimensions: (coherence: 6, flightdirection: 2, latitude: 4200, longitude: 7200, orbit: 175, polarization: 4, season: 4) Coordinates: * coherence (coherence) uint8 6B 6 12 18 24 36 48 * flightdirection (flightdirection) <U1 8B 'A' 'D' * latitude (latitude) float32 17kB 36.0 36.0 36.0 ... 32.5 32.5 32.5 * longitude (longitude) float32 29kB -121.0 -121.0 ... -115.0 -115.0 * orbit (orbit) int32 700B 1 2 3 4 5 6 ... 170 171 172 173 174 175 * polarization (polarization) <U2 32B 'vv' 'vh' 'hv' 'hh' * season (season) <U6 96B 'winter' 'spring' 'summer' 'fall' Data variables: COH (season, polarization, coherence, latitude, longitude) uint8 3GB dask.array<chunksize=(1, 1, 1, 1200, 1200), meta=np.ndarray> AMP (season, polarization, latitude, longitude) uint16 968MB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray> tau (season, polarization, latitude, longitude) uint16 968MB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray> rmse (season, polarization, latitude, longitude) uint16 968MB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray> rho (season, polarization, latitude, longitude) uint16 968MB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray> inc (orbit, flightdirection, latitude, longitude) uint8 11GB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray> lsmap (orbit, flightdirection, latitude, longitude) uint8 11GB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>
Plot a selection of the dataset¶
The few GeoTIFF files comprising the selection are transparently downloaded, decoded, and stitched to an in-memory NumPy array and plotted using Matplotlib.
image = socal['COH'].loc['winter', 'vv', 12]
assert image[100, 100] == 53
xarray.plot.imshow(image, size=6, aspect=1.8)
matplotlib.pyplot.show()
System information¶
Print information about the software used to run this script.
def system_info() -> str:
"""Print information about Python environment."""
import datetime
import sys
import fsspec
import imagecodecs
import kerchunk
import matplotlib
import numcodecs
import numpy
import tifffile
import xarray
import zarr
return '\n'.join(
(
sys.executable,
f'Python {sys.version}',
'',
f'numpy {numpy.__version__}',
f'matplotlib {matplotlib.__version__}',
f'tifffile {tifffile.__version__}',
f'imagecodecs {imagecodecs.__version__}',
f'numcodecs {numcodecs.__version__}',
f'fsspec {fsspec.__version__}',
f'kerchunk {kerchunk.__version__}',
f'xarray {xarray.__version__}',
f'zarr {zarr.__version__}',
'',
str(datetime.datetime.now()),
)
)
print(system_info())
X:\Python313\python.exe Python 3.13.3 (tags/v3.13.3:6280bb5, Apr 8 2025, 14:47:33) [MSC v.1943 64 bit (AMD64)] numpy 2.2.6 matplotlib 3.10.3 tifffile 2025.5.21 imagecodecs 2025.3.30 numcodecs 0.15.1 fsspec 2025.5.0 kerchunk 0.2.8 xarray 2025.4.0 zarr 3.0.8 2025-05-21 19:59:18.530575