Create a fsspec ReferenceFileSystem for a large set of remote GeoTIFF files¶

by Christoph Gohlke

Published Oct 9, 2021. Last updated May 21, 2025.

This Python script uses the tifffile and imagecodecs packages to create a fsspec ReferenceFileSystem file in JSON format for the Earthbigdata set, which consists of 1,033,422 GeoTIFF files stored on AWS. The ReferenceFileSystem is used to create a multi-dimensional Xarray dataset.

Refer to the discussion at kerchunk/issues/78.

Source code is available in the tifffile repository.

In [1]:
import base64
import os

import imagecodecs.numcodecs
import kerchunk.utils
import matplotlib.pyplot
import tifffile
import tifffile.zarr
import xarray
import zarr  # >= 3

Get a list of all remote TIFF files¶

Call the AWS command line app to recursively list all files in the Earthbigdata set. Cache the output in a local file. Filter the list for TIFF files and remove the common path.

In [2]:
if not os.path.exists('earthbigdata.txt'):
    os.system(
        'aws s3 ls sentinel-1-global-coherence-earthbigdata/data/tiles'
        ' --recursive > earthbigdata.txt'
    )

with open('earthbigdata.txt', encoding='utf-8') as fh:
    tiff_files = [
        line.split()[-1][11:] for line in fh.readlines() if '.tif' in line
    ]
print('Number of TIFF files:', len(tiff_files))
Number of TIFF files: 1033422

Define metadata to describe the dataset¶

Define labels, coordinate arrays, file name regular expression patterns, and categories for all dimensions in the Earthbigdata set.

In [3]:
baseurl = (
    'https://'
    'sentinel-1-global-coherence-earthbigdata.s3.us-west-2.amazonaws.com'
    '/data/tiles/'
)

chunkshape = (1200, 1200)
fillvalue = 0

latitude_label = 'latitude'
latitude_pattern = rf'(?P<{latitude_label}>[NS]\d+)'
latitude_coordinates = [
    (j * -0.00083333333 - 0.000416666665 + i)
    for i in range(82, -79, -1)
    for j in range(1200)
]
latitude_category = {}
i = 0
for j in range(82, -1, -1):
    latitude_category[f'N{j:-02}'] = i
    i += 1
for j in range(1, 79):
    latitude_category[f'S{j:-02}'] = i
    i += 1

longitude_label = 'longitude'
longitude_pattern = rf'(?P<{longitude_label}>[EW]\d+)'
longitude_coordinates = [
    (j * 0.00083333333 + 0.000416666665 + i)
    for i in range(-180, 180)
    for j in range(1200)
]
longitude_category = {}
i = 0
for j in range(180, 0, -1):
    longitude_category[f'W{j:-03}'] = i
    i += 1
for j in range(180):
    longitude_category[f'E{j:-03}'] = i
    i += 1

season_label = 'season'
season_category = {'winter': 0, 'spring': 1, 'summer': 2, 'fall': 3}
season_coordinates = list(season_category.keys())
season_pattern = rf'_(?P<{season_label}>{"|".join(season_category)})'

polarization_label = 'polarization'
polarization_category = {'vv': 0, 'vh': 1, 'hv': 2, 'hh': 3}
polarization_coordinates = list(polarization_category.keys())
polarization_pattern = (
    rf'_(?P<{polarization_label}>{"|".join(polarization_category)})'
)

coherence_label = 'coherence'
coherence_category = {
    '06': 0,
    '12': 1,
    '18': 2,
    '24': 3,
    '36': 4,
    '48': 5,
}
coherence_coordinates = list(int(i) for i in coherence_category.keys())
coherence_pattern = (
    rf'_COH(?P<{coherence_label}>{"|".join(coherence_category)})'
)

orbit_label = 'orbit'
orbit_coordinates = list(range(1, 176))
orbit_pattern = rf'_(?P<{orbit_label}>\d+)'

flightdirection_label = 'flightdirection'
flightdirection_category = {'A': 0, 'D': 1}
flightdirection_coordinates = list(flightdirection_category.keys())
flightdirection_pattern = (
    rf'(?P<{flightdirection_label}>[{"|".join(flightdirection_category)}])_'
)

Open a file for writing the fsspec ReferenceFileSystem in JSON format¶

In [4]:
jsonfile = open('earthbigdata.json', 'w', encoding='utf-8', newline='\n')

Write the coordinate arrays¶

Add the coordinate arrays to a Zarr group, convert it to a fsspec ReferenceFileSystem JSON string, and write it to the open file.

In [5]:
coordinates = {}  # type: ignore[var-annotated]
zarrgroup = zarr.open_group(coordinates, use_consolidated=False, zarr_format=2)
zararray = zarrgroup.create_array(
    longitude_label,
    shape=[len(longitude_coordinates)],
    dtype='float32',
    # compressor='zlib',
)
zararray[:] = longitude_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [longitude_label]

zararray = zarrgroup.create_array(
    latitude_label,
    shape=[len(latitude_coordinates)],
    dtype='float32',
    # compressor='zlib',
)
zararray[:] = latitude_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [latitude_label]

zararray = zarrgroup.create_array(
    season_label,
    shape=[len(season_coordinates)],
    dtype='|U6',
    compressor=None,
)
zararray[:] = season_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [season_label]

zararray = zarrgroup.create_array(
    polarization_label,
    shape=[len(polarization_coordinates)],
    dtype='|U2',
    compressor=None,
)
zararray[:] = polarization_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [polarization_label]

zararray = zarrgroup.create_array(
    coherence_label,
    shape=[len(coherence_coordinates)],
    dtype='uint8',
    compressor=None,
)
zararray[:] = coherence_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [coherence_label]

zararray = zarrgroup.create_array(
    orbit_label, shape=[len(orbit_coordinates)], dtype='int32'
)
zararray[:] = orbit_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [orbit_label]

zararray = zarrgroup.create_array(
    flightdirection_label,
    shape=[len(flightdirection_coordinates)],
    dtype='|U1',
    compressor=None,
)
zararray[:] = flightdirection_coordinates
zararray.attrs['_ARRAY_DIMENSIONS'] = [flightdirection_label]

# base64 encode any values containing non-ascii characters
for k, v in coordinates.items():
    v = v.to_bytes()
    try:
        coordinates[k] = v.decode()
    except UnicodeDecodeError:
        coordinates[k] = 'base64:' + base64.b64encode(v).decode()

coordinates_json = tifffile.zarr._json_dumps(coordinates).decode()

jsonfile.write(coordinates_json[:-2])  # skip the last newline and brace
Out[5]:
69746

Create a TiffSequence from a list of file names¶

Filter the list of GeoTIFF files for files containing coherence 'COH' data. The regular expression pattern and categories are used to parse the file names for chunk indices.

Note: the created TiffSequence cannot be used to access any files. The file names do not refer to existing files. The baseurl is later used to get the real location of the files.

In [6]:
mode = 'COH'
fileseq = tifffile.TiffSequence(
    [file for file in tiff_files if '_' + mode in file],
    pattern=(
        latitude_pattern
        + longitude_pattern
        + season_pattern
        + polarization_pattern
        + coherence_pattern
    ),
    categories={
        latitude_label: latitude_category,
        longitude_label: longitude_category,
        season_label: season_category,
        polarization_label: polarization_category,
        coherence_label: coherence_category,
    },
)
assert len(fileseq) == 444821
assert fileseq.files_missing == 5119339
assert fileseq.shape == (161, 360, 4, 4, 6)
assert fileseq.dims == (
    'latitude',
    'longitude',
    'season',
    'polarization',
    'coherence',
)
print(fileseq)
TiffSequence
 N00E005_fall_vv_COH12.tif
 files: 444821 (5119339 missing)
 shape: 161, 360, 4, 4, 6
 dims: latitude, longitude, season, polarization, coherence

Create a ZarrTiffStore from the TiffSequence¶

Define axestiled to tile the latitude and longitude dimensions of the TiffSequence with the first and second image/chunk dimensions. Define extra zattrs to create a Xarray compatible store.

In [7]:
store = fileseq.aszarr(
    chunkdtype='uint8',
    chunkshape=chunkshape,
    fillvalue=fillvalue,
    axestiled={0: 0, 1: 1},
    zattrs={
        '_ARRAY_DIMENSIONS': [
            season_label,
            polarization_label,
            coherence_label,
            latitude_label,
            longitude_label,
        ]
    },
)
print(store)
ZarrFileSequenceStore

Append the ZarrTiffStore to the open ReferenceFileSystem file¶

Use the mode name to create a Zarr subgroup. Use the imagecodecs_tiff Numcodecs compatible codec for decoding TIFF files.

In [8]:
store.write_fsspec(
    jsonfile,
    baseurl,
    groupname=mode,
    codec_id='imagecodecs_tiff',
    _append=True,
    _close=False,
)

Repeat for the other modes¶

Repeat the TiffSequence->aszarr->write_fsspec workflow for the other modes.

In [9]:
for mode in (
    'AMP',
    'tau',
    'rmse',
    'rho',
):
    fileseq = tifffile.TiffSequence(
        [file for file in tiff_files if '_' + mode in file],
        pattern=(
            latitude_pattern
            + longitude_pattern
            + season_pattern
            + polarization_pattern
        ),
        categories={
            latitude_label: latitude_category,
            longitude_label: longitude_category,
            season_label: season_category,
            polarization_label: polarization_category,
        },
    )
    print(fileseq)
    with fileseq.aszarr(
        chunkdtype='uint16',
        chunkshape=chunkshape,
        fillvalue=fillvalue,
        axestiled={0: 0, 1: 1},
        zattrs={
            '_ARRAY_DIMENSIONS': [
                season_label,
                polarization_label,
                latitude_label,
                longitude_label,
            ]
        },
    ) as store:
        print(store)
        store.write_fsspec(
            jsonfile,
            baseurl,
            groupname=mode,
            codec_id='imagecodecs_tiff',
            _append=True,
            _close=False,
        )


for mode in ('inc', 'lsmap'):
    fileseq = tifffile.TiffSequence(
        [file for file in tiff_files if '_' + mode in file],
        pattern=(
            latitude_pattern
            + longitude_pattern
            + orbit_pattern
            + flightdirection_pattern
        ),
        categories={
            latitude_label: latitude_category,
            longitude_label: longitude_category,
            # orbit has no category
            flightdirection_label: flightdirection_category,
        },
    )
    print(fileseq)
    with fileseq.aszarr(
        chunkdtype='uint8',
        chunkshape=chunkshape,
        fillvalue=fillvalue,
        axestiled={0: 0, 1: 1},
        zattrs={
            '_ARRAY_DIMENSIONS': [
                orbit_label,
                flightdirection_label,
                latitude_label,
                longitude_label,
            ]
        },
    ) as store:
        print(store)
        store.write_fsspec(
            jsonfile,
            baseurl,
            groupname=mode,
            codec_id='imagecodecs_tiff',
            _append=True,
            _close=mode == 'lsmap',  # close after last store
        )
TiffSequence
 N00E005_fall_vh_AMP.tif
 files: 187693 (739667 missing)
 shape: 161, 360, 4, 4
 dims: latitude, longitude, season, polarization
ZarrFileSequenceStore
TiffSequence
 N00E005_fall_vv_tau.tif
 files: 98331 (829029 missing)
 shape: 161, 360, 4, 4
 dims: latitude, longitude, season, polarization
ZarrFileSequenceStore
TiffSequence
 N00E005_fall_vv_rmse.tif
 files: 98331 (829029 missing)
 shape: 161, 360, 4, 4
 dims: latitude, longitude, season, polarization
ZarrFileSequenceStore
TiffSequence
 N00E005_fall_vv_rho.tif
 files: 98331 (829029 missing)
 shape: 161, 360, 4, 4
 dims: latitude, longitude, season, polarization
ZarrFileSequenceStore
TiffSequence
 N00E005_124D_inc.tif
 files: 52956 (20233044 missing)
 shape: 161, 360, 175, 2
 dims: latitude, longitude, orbit, flightdirection
ZarrFileSequenceStore
TiffSequence
 N00E005_124D_lsmap.tif
 files: 52959 (20233041 missing)
 shape: 161, 360, 175, 2
 dims: latitude, longitude, orbit, flightdirection
ZarrFileSequenceStore

Close the JSON file¶

In [10]:
jsonfile.close()

Use the fsspec ReferenceFileSystem file to create a Xarray dataset¶

Register imagecodecs.numcodecs before using the ReferenceFileSystem.

In [11]:
imagecodecs.numcodecs.register_codecs()

Create a Xarray dataset from the ReferenceFileSystem file¶

Use mask_and_scale to disable conversion to floating point.

In [12]:
store = kerchunk.utils.refs_as_store('earthbigdata.json')
dataset = xarray.open_zarr(store, consolidated=False, mask_and_scale=False)

print(dataset)
<xarray.Dataset> Size: 77TB
Dimensions:          (coherence: 6, flightdirection: 2, latitude: 193200,
                      longitude: 432000, orbit: 175, polarization: 4, season: 4)
Coordinates:
  * coherence        (coherence) uint8 6B 6 12 18 24 36 48
  * flightdirection  (flightdirection) <U1 8B 'A' 'D'
  * latitude         (latitude) float32 773kB 82.0 82.0 82.0 ... -79.0 -79.0
  * longitude        (longitude) float32 2MB -180.0 -180.0 ... 180.0 180.0
  * orbit            (orbit) int32 700B 1 2 3 4 5 6 ... 170 171 172 173 174 175
  * polarization     (polarization) <U2 32B 'vv' 'vh' 'hv' 'hh'
  * season           (season) <U6 96B 'winter' 'spring' 'summer' 'fall'
Data variables:
    COH              (season, polarization, coherence, latitude, longitude) uint8 8TB dask.array<chunksize=(1, 1, 1, 1200, 1200), meta=np.ndarray>
    AMP              (season, polarization, latitude, longitude) uint16 3TB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>
    tau              (season, polarization, latitude, longitude) uint16 3TB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>
    rmse             (season, polarization, latitude, longitude) uint16 3TB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>
    rho              (season, polarization, latitude, longitude) uint16 3TB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>
    inc              (orbit, flightdirection, latitude, longitude) uint8 29TB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>
    lsmap            (orbit, flightdirection, latitude, longitude) uint8 29TB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>

Select the Southern California region in the dataset¶

In [13]:
socal = dataset.sel(latitude=slice(36, 32.5), longitude=slice(-121, -115))
print(socal)
<xarray.Dataset> Size: 28GB
Dimensions:          (coherence: 6, flightdirection: 2, latitude: 4200,
                      longitude: 7200, orbit: 175, polarization: 4, season: 4)
Coordinates:
  * coherence        (coherence) uint8 6B 6 12 18 24 36 48
  * flightdirection  (flightdirection) <U1 8B 'A' 'D'
  * latitude         (latitude) float32 17kB 36.0 36.0 36.0 ... 32.5 32.5 32.5
  * longitude        (longitude) float32 29kB -121.0 -121.0 ... -115.0 -115.0
  * orbit            (orbit) int32 700B 1 2 3 4 5 6 ... 170 171 172 173 174 175
  * polarization     (polarization) <U2 32B 'vv' 'vh' 'hv' 'hh'
  * season           (season) <U6 96B 'winter' 'spring' 'summer' 'fall'
Data variables:
    COH              (season, polarization, coherence, latitude, longitude) uint8 3GB dask.array<chunksize=(1, 1, 1, 1200, 1200), meta=np.ndarray>
    AMP              (season, polarization, latitude, longitude) uint16 968MB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>
    tau              (season, polarization, latitude, longitude) uint16 968MB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>
    rmse             (season, polarization, latitude, longitude) uint16 968MB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>
    rho              (season, polarization, latitude, longitude) uint16 968MB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>
    inc              (orbit, flightdirection, latitude, longitude) uint8 11GB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>
    lsmap            (orbit, flightdirection, latitude, longitude) uint8 11GB dask.array<chunksize=(1, 1, 1200, 1200), meta=np.ndarray>

Plot a selection of the dataset¶

The few GeoTIFF files comprising the selection are transparently downloaded, decoded, and stitched to an in-memory NumPy array and plotted using Matplotlib.

In [14]:
image = socal['COH'].loc['winter', 'vv', 12]
assert image[100, 100] == 53
xarray.plot.imshow(image, size=6, aspect=1.8)
matplotlib.pyplot.show()
No description has been provided for this image

System information¶

Print information about the software used to run this script.

In [15]:
def system_info() -> str:
    """Print information about Python environment."""
    import datetime
    import sys

    import fsspec
    import imagecodecs
    import kerchunk
    import matplotlib
    import numcodecs
    import numpy
    import tifffile
    import xarray
    import zarr

    return '\n'.join(
        (
            sys.executable,
            f'Python {sys.version}',
            '',
            f'numpy {numpy.__version__}',
            f'matplotlib {matplotlib.__version__}',
            f'tifffile {tifffile.__version__}',
            f'imagecodecs {imagecodecs.__version__}',
            f'numcodecs {numcodecs.__version__}',
            f'fsspec {fsspec.__version__}',
            f'kerchunk {kerchunk.__version__}',
            f'xarray {xarray.__version__}',
            f'zarr {zarr.__version__}',
            '',
            str(datetime.datetime.now()),
        )
    )


print(system_info())
X:\Python313\python.exe
Python 3.13.3 (tags/v3.13.3:6280bb5, Apr  8 2025, 14:47:33) [MSC v.1943 64 bit (AMD64)]

numpy 2.2.6
matplotlib 3.10.3
tifffile 2025.5.21
imagecodecs 2025.3.30
numcodecs 0.15.1
fsspec 2025.5.0
kerchunk 0.2.8
xarray 2025.4.0
zarr 3.0.8

2025-05-21 19:59:18.530575