# import common modules
import datetime
import math
import multiprocessing
import os
import random
import sys
import threading
import warnings

# limit the number of CPUs to use: half of the available cores, capped at 8,
# but never fewer than 1 because thread pools need at least one worker
MAXCPUS = max(1, min(8, multiprocessing.cpu_count() // 2))

# DATA_PATH: read-only directory where the example data were extracted
# SCRATCH_PATH: writable directory where large intermediate and output files
#   will be saved; must not be a network drive
if os.path.exists('../../data/02_fcs_computation/'):
    # directory layout of the BigDIPA workshop cluster
    DATA_PATH = '../../data/02_fcs_computation/'
    SCRATCH_PATH = '../../scratch/'
else:
    # default: read from and write to the current working directory
    DATA_PATH = './'
    SCRATCH_PATH = './'

# run MKL and OpenBLAS sequentially to prevent thread oversubscription;
# the variables are set here, before numpy is imported below
os.environ.update(MKL_NUM_THREADS='1', OPENBLAS_NUM_THREADS='1')
# os.environ['OMP_NUM_THREADS'] = '1'

import numpy

# compiler and linker arguments enabling OpenMP, passed to the Cython magic
# (/openmp on Windows, -fopenmp for compiling and linking elsewhere)
OPENMP_ARGS = (
    '--compile-args=/openmp'
    if sys.platform == 'win32'
    else '--compile-args=-fopenmp --link-args=-fopenmp'
)

# tell numba where to find CUDA NVVM if the CUDA_PATH environment variable
# points to an existing directory
cuda_path = os.environ.get('CUDA_PATH', 'CUDA not found')
if os.path.exists(cuda_path):
    # append the CUDA binary directories to the search path using the
    # platform's own separators; on Windows this yields the same
    # ';<cuda>\bin;<cuda>\nvvm\bin' suffix as before, elsewhere it no longer
    # corrupts PATH. A missing or empty PATH is tolerated.
    os.environ['PATH'] = os.pathsep.join(
        filter(
            None,
            [
                os.environ.get('PATH'),
                os.path.join(cuda_path, 'bin'),
                os.path.join(cuda_path, 'nvvm', 'bin'),
            ],
        )
    )
    os.environ['CUDA_HOME'] = cuda_path

# silence all warnings
warnings.simplefilter(action='ignore')

# re-entrant lock object used to force single threaded execution
THREADLOCK = threading.RLock()

# seed the Python and numpy random number generators with a fixed value
random.seed(a=42)
numpy.random.seed(seed=42)

# display plots within Jupyter notebook
%matplotlib inline

# CUDA examples are skipped when the cupy package cannot be imported
try:
    import cupy
except ImportError:
    SKIP_CUDA = True
else:
    SKIP_CUDA = False

# remember the time at which this setup code ran
START_TIME = datetime.datetime.now()

def ipcf_reference(image_timeseries, circle_coordinates, bins):
    """Return pair correlation function analysis of image time series.

    Cross-correlate the time series of each pixel in the image
    with all its neighbors at a certain radius and return all
    the log-binned and smoothed correlation curves.

    Parameters
    ----------
    image_timeseries : array
        Three-dimensional array indexed as [time, y, x].
    circle_coordinates : array
        Two-dimensional array of integer (u, v) offsets, one row per
        neighbor. The radius is read from element [0, 0], i.e. the first
        row is assumed to be (radius, 0).
    bins : sequence of int
        Bin boundaries passed on to `average`.

    Returns
    -------
    numpy.ndarray
        Array of type float32 and shape
        (height - 2 * radius, width - 2 * radius, npoints, len(bins)).

    """
    ntimes, height, width = image_timeseries.shape
    npoints = len(circle_coordinates)
    radius = circle_coordinates[0, 0]

    # use the qualified numpy.zeros: a bare `zeros` is not imported at this
    # point of the file
    result = numpy.zeros(
        (height - 2 * radius, width - 2 * radius, npoints, len(bins)),
        numpy.float32,
    )

    # skip a border of `radius` pixels so that all neighbors are in the image
    for y in range(radius, height - radius):
        for x in range(radius, width - radius):
            a = image_timeseries[:, y, x]
            for i in range(npoints):
                u, v = circle_coordinates[i]
                b = image_timeseries[:, y + v, x + u]
                c = correlate(b, a)
                result[y - radius, x - radius, i] = smooth(average(c, bins))

    return result


# functions that need to be implemented


def correlate(a, b):
    """Return cross-correlation of two arrays.

    Placeholder to be implemented: the body is empty and returns None.

    """
    ...


def average(c, bins):
    """Return averaged chunks of array.

    Placeholder to be implemented: the body is empty and returns None.

    """
    ...


def smooth(c):
    """Return smoothed array.

    Placeholder to be implemented: the body is empty and returns None.

    """
    ...


def circle(npoints, radius):
    """Return circle coordinates.

    Placeholder to be implemented: the body is empty and returns None.

    NOTE(review): the implementation further down in this file is declared
    as ``circle(radius, npoints)`` and is called positionally in that
    order; confirm which parameter order is intended here.

    """
    ...


def logbins(size, nbins):
    """Return up to nbins exponentially increasing integers from 1 to size.

    Placeholder to be implemented: the body is empty and returns None.

    """
    ...

# linear correlation of two short sequences written out term by term:
# the element before the start of `a` is taken to be zero
a = [1, 2]
b = [3, 4]

c = [
    0 * b[0] + a[1 - 1] * b[1],  # delay -1: a[-1] is replaced by 0
    a[0 + 0] * b[0] + a[1 + 0] * b[1],  # delay 0
]

print(c)

[4, 11]

# circular correlation of the same sequences written out term by term:
# the negative index a[-1] wraps around to the last element, a[1]
a = [1, 2]
b = [3, 4]

c = [
    a[0 - 1] * b[0] + a[1 - 1] * b[1],  # delay -1: a[-1] is a[1]
    a[0 + 0] * b[0] + a[1 + 0] * b[1],  # delay 0
]

print(c)

[10, 11]

# the same wrap-around expressions applied to zero-padded sequences:
# the wrapped elements are zeros, so the result equals the linear one
a = [0, 1, 2, 0]
b = [0, 3, 4, 0]

c = [
    # delay -1: a[0 - 1] wraps to the trailing padding zero
    a[0 - 1] * b[0] + a[1 - 1] * b[1] + a[2 - 1] * b[2] + a[3 - 1] * b[3],
    # delay 0
    a[0 + 0] * b[0] + a[1 + 0] * b[1] + a[2 + 0] * b[2] + a[3 + 0] * b[3],
]

print(c)

[4, 11]

def dot_python(a, b, start, stop, delay):
    """Return sum of a[k + delay] * b[k] for k from start up to stop.

    The index `stop` itself is excluded; an empty range gives 0.

    """
    total = 0
    for k in range(start, stop):
        total += a[k + delay] * b[k]
    return total


def correlate_python(a, b):
    """Return linear correlation of two sequences as list.

    The output has the length of `a`; the element at index `len(a) // 2`
    corresponds to zero delay.

    """
    n = len(a)
    center = n // 2

    out = []
    for position in range(n):
        lag = position - center
        if lag >= 0:
            # `a` is shifted forward: the last `lag` items of `b` are unused
            value = dot_python(a, b, 0, n - lag, lag)
        else:
            # `a` is shifted backward: the first `-lag` items of `b` are unused
            value = dot_python(a, b, -lag, n, lag)
        out.append(value)

    return out

def test_correlate(correlate_function):
    """Check linear correlate function against known results.

    Raise AssertionError carrying the returned value on mismatch.

    """
    cases = (
        # even lengths
        (([1, 2], [3, 4]), [4, 11]),
        # uneven lengths
        (([1, 2, 3], [4, 8, 16]), [40, 68, 32]),
    )
    for arguments, expected in cases:
        c = correlate_function(*arguments)
        assert list(c) == expected, c


test_correlate(correlate_python)

import random

A = [random.random() - 0.5 for _ in range(2**13)]
B = [random.random() - 0.5 for _ in range(2**13)]

%time c = correlate_python(A, B)

CPU times: total: 1.09 s
Wall time: 1.57 s

from matplotlib import pyplot


def plot_autocorrelation(size=200):
    """Plot autocorrelation of a random sequence.

    Parameters
    ----------
    size : int
        Length of the random sequence.

    """
    a = [random.random() - 0.5 for _ in range(size)]
    c = correlate_python(a, a)

    # correlate_python puts zero delay at index len(a) // 2, for even and
    # odd lengths alike; `-len(a) // 2` would be off by one for odd lengths
    half = len(a) // 2
    delays = list(range(-half, len(a) - half))

    pyplot.figure(figsize=(6, 6))
    pyplot.subplot(2, 1, 1)
    pyplot.title('random sequence')
    pyplot.ylabel('intensity')
    pyplot.plot(a, 'g')

    pyplot.subplot(2, 1, 2)
    pyplot.title('auto-correlation')
    pyplot.xlabel('delay')
    pyplot.ylabel('correlation')
    pyplot.plot(delays, c, 'r')

    pyplot.tight_layout()
    pyplot.show()


plot_autocorrelation()

from ipywidgets import Dropdown, IntSlider, interact
from matplotlib import pyplot


def plot_crosscorrelation(size=100):
    """Interactively plot cross-correlation of signals with delayed peak.

    Two random sequences of length `size` are created. A peak is added in
    the middle of the first and, shifted by an interactively chosen delay,
    to a copy of the second. A dropdown selects the order of the arguments
    passed to `correlate_python`.

    """
    # correlate_python puts zero delay at index size // 2, for even and odd
    # sizes alike; `-size // 2` would be off by one for odd sizes
    half = size // 2
    delays = list(range(-half, size - half))
    a = [random.random() - 0.5 for _ in range(size)]
    b = [random.random() - 0.5 for _ in range(size)]

    a[size // 2] = 10  # add peak in middle of sequence

    def _plot(option, delay):
        # work on a copy so that the peak does not accumulate between calls
        b_ = b.copy()
        b_[size // 2 + delay] = 10  # add peak at shifted position

        # the option label ends with the name of the second argument
        if option.endswith('b'):
            c = correlate_python(a, b_)
        else:
            c = correlate_python(b_, a)

        pyplot.figure(figsize=(6, 6))
        pyplot.subplot(2, 1, 1)
        pyplot.title('random sequences with peak')
        pyplot.ylabel('intensity')
        pyplot.plot(a, 'g', label='a')
        pyplot.plot(b_, 'b', label='b')
        pyplot.ylim([-2, 12])
        pyplot.yticks([0, 5, 10])
        pyplot.legend(fancybox=True, framealpha=0.5)

        pyplot.subplot(2, 1, 2)
        pyplot.title('cross-correlation')
        pyplot.xlabel('delay')
        pyplot.ylabel('correlation')
        pyplot.xlim([-half, size - half])
        pyplot.ylim([-20, 120])
        pyplot.yticks([0, 50, 100])
        pyplot.plot(delays, c, 'r', label=option)
        pyplot.legend(fancybox=True, framealpha=0.5)

        pyplot.tight_layout()
        pyplot.show()

    interact(
        _plot,
        option=Dropdown(options=['a\u2605b', 'b\u2605a']),
        delay=IntSlider(
            value=size // 5,
            min=2 - size // 2,
            max=size // 2 - 1,
            continuous_update=False,
        ),
    )


plot_crosscorrelation()

from concurrent.futures import ThreadPoolExecutor
from functools import partial


def map_threaded(function, *iterables, max_workers=MAXCPUS, **kwargs):
    """Return list of results of calling function with items of iterables.

    The calls are executed asynchronously by a pool of `max_workers`
    threads. Additional keyword arguments are passed on to every call.

    """
    worker = partial(function, **kwargs) if kwargs else function
    with ThreadPoolExecutor(max_workers) as pool:
        results = pool.map(worker, *iterables)
        return list(results)

%time c = map_threaded(correlate_python, [A, A], [B, B])

assert c[0] == c[1]

CPU times: total: 859 ms
Wall time: 3.13 s

%%writefile correlate_c.c

/* A linear correlate function implemented in C. */

#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>

typedef ptrdiff_t ssize_t;

/* Return sum of a[i + delay] * b[i] for i from start up to, excluding, end. */
double dot_c(double *a, double *b, ssize_t start, ssize_t end, ssize_t delay)
{
    double total = 0.0;
    ssize_t i = start;

    while (i < end) {
        total += a[i + delay] * b[i];
        i++;
    }
    return total;
}


/* Store linear correlation of one-dimensional sequences a and b in c.
 * All three arrays hold size items; c[size / 2] corresponds to zero delay.
 */
void correlate_c(double *a, double *b, double *c, ssize_t size)
{
    const ssize_t half = size / 2;
    ssize_t i;

    for (i = 0; i < size; i++) {
        const ssize_t lag = i - half;
        c[i] = (lag < 0) ? dot_c(a, b, -lag, size, lag)
                         : dot_c(a, b, 0, size - lag, lag);
    }
}


/* Time the correlate_c function.
 *
 * Correlate two random sequences `loops` times. The output array is
 * allocated and freed in every iteration. Return 0 on success and 1 if
 * memory could not be allocated.
 */
int main(void)
{
    ssize_t i;
    ssize_t size = 8192;
    ssize_t loops = 25;
    int status = 0;

    double *a = (double*)malloc(size * sizeof(double));
    double *b = (double*)malloc(size * sizeof(double));
    if (a == NULL || b == NULL) {
        fprintf(stderr, "could not allocate input arrays\n");
        free(a);  /* free(NULL) is a no-op */
        free(b);
        return 1;
    }

    /* fill inputs with pseudo-random values in range [-0.5, 0.5] */
    for (i = 0; i < size; i++) {
        a[i] = (double)rand()/(double)(RAND_MAX) - 0.5;
        b[i] = (double)rand()/(double)(RAND_MAX) - 0.5;
    }

    for (i = 0; i < loops; i++) {
        double *c = (double*)malloc(size * sizeof(double));
        if (c == NULL) {
            fprintf(stderr, "could not allocate output array\n");
            status = 1;
            break;
        }
        correlate_c(a, b, c, size);
        free(c);
    }

    free(a);
    free(b);

    return status;
}

Writing correlate_c.c

from distutils import ccompiler

compiler = ccompiler.new_compiler()
objects = compiler.compile(['correlate_c.c'], extra_postargs=['-O2'])
compiler.link_executable(objects, 'correlate_c')

correlate_executable = './correlate_c'
t = %timeit -r 1 -q -o ! $correlate_executable

print('{:.2f} ms per loop'.format(t.best * 1000 / 25))

18.74 ms per loop

import random
import sys

size = 8192
alist = [random.random() for _ in range(size)]

print(
    'Storage size of Python list: {:>6} bytes'.format(
        sys.getsizeof(alist) + sys.getsizeof(alist[0]) * size
    )
)

print('Storage size of C array:     {:>6} bytes'.format(8 + size * 8))

Storage size of Python list: 263832 bytes
Storage size of C array:      65544 bytes

import numpy


def correlate_numpy(a, b):
    """Return linear correlation of two one-dimensional arrays.

    Thin wrapper of numpy.correlate in 'same' mode.

    """
    result = numpy.correlate(a, b, 'same')
    return result


def test_correlate(correlate_function, **kwargs):
    """Check correlate function against known results.

    The function is called with two float arrays and the additional
    keyword arguments. Raise AssertionError carrying the returned value
    if the result is not close to the expected one.

    """
    cases = (
        # uneven length
        ([1.0, 2.0, 3.0], [4.0, 8.0, 16.0], [40.0, 68.0, 32.0]),
        # even length
        (
            [1.0, 2.0, 3.0, 4.0],
            [5.0, 6.0, 7.0, 8.0],
            [23.0, 44.0, 70.0, 56.0],
        ),
    )
    for first, second, expected in cases:
        c = correlate_function(
            numpy.array(first), numpy.array(second), **kwargs
        )
        assert numpy.allclose(c, expected), c


test_correlate(correlate_numpy)

A = numpy.random.random(2**13) - 0.5
B = numpy.random.random(2**13) - 0.5

%timeit correlate_numpy(A, B)

5.18 ms ± 116 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

print('Storage size of numpy array: {} bytes'.format(sys.getsizeof(A)))

Storage size of numpy array: 65648 bytes

%timeit map_threaded(correlate_numpy, [A, A], [B, B])

5.3 ms ± 57.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

%reload_ext Cython

%%cython -f --compile-args=-O2
#
#cython: boundscheck=False
#cython: wraparound=False

import numpy


cdef double dot_cython(
    double[::1] a,
    double[::1] b,
    ssize_t start,
    ssize_t end,
    ssize_t delay
) noexcept nogil:
    """Return sum of a[k + delay] * b[k] for k from start up to end."""
    cdef ssize_t k
    cdef double total = 0.0

    for k in range(start, end):
        total += a[k + delay] * b[k]
    return total


def correlate_cython(double[::1] a not None, double[::1] b not None):
    """Return linear correlation of two one-dimensional arrays.

    The result is a new float64 numpy array of the length of `a`.
    The correlation loop runs without holding the GIL.

    """
    cdef:
        ssize_t n = len(a)
        ssize_t half = n // 2
        ssize_t i, lag
        double[::1] out

    result = numpy.empty(n, dtype=numpy.float64)

    # write through a typed memoryview of the output array because
    # numpy array objects cannot be accessed in a nogil section
    out = result

    with nogil:
        for i in range(n):
            lag = i - half
            if lag >= 0:
                out[i] = dot_cython(a, b, 0, n - lag, lag)
            else:
                out[i] = dot_cython(a, b, -lag, n, lag)

    return result

test_correlate(correlate_cython)

%timeit correlate_cython(A, B)
%timeit map_threaded(correlate_cython, [A, A], [B, B])

18.1 ms ± 29.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
18.5 ms ± 18.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

%%cython -f --compile-args=-O2  $OPENMP_ARGS
#
#cython: boundscheck=False
#cython: wraparound=False

import numpy
from cython.parallel import prange, parallel


cdef double dot_cython(
    double[::1] a,
    double[::1] b,
    ssize_t start,
    ssize_t end,
    ssize_t delay
) noexcept nogil:
    """Return sum of a[k + delay] * b[k] for k from start up to end."""
    cdef ssize_t k
    cdef double total = 0.0

    for k in range(start, end):
        total += a[k + delay] * b[k]
    return total


def correlate_cython_omp(
    double[::1] a not None, 
    double[::1] b not None,
    int num_threads=0
):
    """Return linear correlation of two one-dimensional arrays.

    The output elements are computed in a `prange` loop inside a parallel
    section without holding the GIL. `num_threads` is passed on to
    `cython.parallel.parallel`.

    The result is a new float64 numpy array of the size of `a`.

    """
    cdef:
        ssize_t size, delay, index
        double[::1] c

    size = a.size
    result = numpy.empty(size, dtype=numpy.float64)
    # typed memoryview of the output array for use in the nogil section
    c = result

    with nogil, parallel(num_threads=num_threads):
        # every iteration writes a different element of c
        for index in prange(size):
            # zero delay is at index size // 2
            delay = index - size // 2
            if delay < 0:
                c[index] = dot_cython(a, b, -delay, size, delay)
            else:
                c[index] = dot_cython(a, b, 0, size-delay, delay)

    return result

test_correlate(correlate_cython_omp)

%timeit correlate_cython_omp(A, B)
%timeit map_threaded(correlate_cython_omp, [A, A], [B, B])

1.65 ms ± 7.93 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
5.74 ms ± 46.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

import numba
import numpy


@numba.jit(nogil=True)
def dot_numba(a, b, start, stop, delay):
    """Return sum of a[k + delay] * b[k] for k from start up to stop."""
    total = 0.0
    for k in range(start, stop):
        total += a[k + delay] * b[k]
    return total


@numba.jit
def correlate_numba(a, b):
    """Return linear correlation of two one-dimensional arrays.

    The result is a new float64 numpy array of the length of `a`;
    zero delay is at index `len(a) // 2`.

    """
    n = len(a)
    center = n // 2

    out = numpy.empty(n, numpy.float64)  # allocate output numpy array

    for position in range(n):
        lag = position - center
        if lag >= 0:
            out[position] = dot_numba(a, b, 0, n - lag, lag)
        else:
            out[position] = dot_numba(a, b, -lag, n, lag)

    return out


test_correlate(correlate_numba)

%timeit correlate_numba(A, B)
%timeit map_threaded(correlate_numba, [A, A], [B, B])

25.9 ms ± 359 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
53.6 ms ± 288 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

import numba
import numpy


@numba.jit(nogil=True)
def dot_numba(a, b, start, stop, delay):
    """Return sum of a[k + delay] * b[k] for k from start up to stop."""
    total = 0.0
    for k in range(start, stop):
        total += a[k + delay] * b[k]
    return total


@numba.jit(nogil=True, parallel=True)
def correlate_numba_jit(c, a, b, size):
    """Compute linear correlation of two arrays using sliding-dot product.

    The result is written to the first `size` elements of the preallocated
    output array `c`; nothing is returned. The loop over the output
    elements is a `numba.prange` loop.

    """

    # every iteration writes a different element of c
    for index in numba.prange(size):
        # zero delay is at index size // 2
        delay = index - size // 2
        if delay < 0:
            c[index] = dot_numba(a, b, -delay, size, delay)
        else:
            c[index] = dot_numba(a, b, 0, size - delay, delay)


def correlate_numba_parallel(a, b):
    """Return linear correlation of two one-dimensional arrays.

    Allocate the float64 output array and fill it with
    `correlate_numba_jit` while holding the global THREADLOCK.

    """
    n = len(a)
    out = numpy.empty(n, numpy.float64)
    THREADLOCK.acquire()
    try:
        correlate_numba_jit(out, a, b, n)
    finally:
        THREADLOCK.release()
    return out


test_correlate(correlate_numba_parallel)

%timeit correlate_numba_parallel(A, B)
%timeit map_threaded(correlate_numba_parallel, [A, A], [B, B])

2.44 ms ± 64.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
8.15 ms ± 37.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

import numpy

import numba.cuda


@numba.cuda.jit(device=True, inline=True)
def dot_cuda(a, b, start, stop, delay):
    """Return sum of a[k + delay] * b[k] for k from start up to stop."""
    total = 0.0
    for k in range(start, stop):
        total += a[k + delay] * b[k]
    return total


@numba.cuda.jit()
def correlate_cuda_kernel(c, a, b, size):
    """CUDA kernel computing one element of the linear correlation.

    Each thread writes the output element at its own global position.

    """
    # global position of the thread in the 1D grid
    position = numba.cuda.grid(1)

    # threads beyond the end of the arrays have nothing to do
    if position >= size:
        return

    lag = position - size // 2
    if lag >= 0:
        c[position] = dot_cuda(a, b, 0, size - lag, lag)
    else:
        c[position] = dot_cuda(a, b, -lag, size, lag)


def correlate_numba_cuda(a, b):
    """Return linear correlation of two one-dimensional arrays.

    Launch `correlate_cuda_kernel` with 32 threads per block and enough
    blocks to cover all elements of the float64 output array.

    """
    size = a.size
    result = numpy.zeros(size, numpy.float64)

    threads_per_block = 32
    # number of blocks, rounded up
    blocks_per_grid = -(-size // threads_per_block)

    # configure and launch the CUDA kernel
    kernel = correlate_cuda_kernel[blocks_per_grid, threads_per_block]
    kernel(result, a, b, size)

    return result


if not SKIP_CUDA:
    test_correlate(correlate_numba_cuda)

    %timeit correlate_numba_cuda(A, B)
    # crashes: %timeit map_threaded(correlate_numba_cuda, [A, A], [B, B])

1.35 ms ± 25.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

import scipy.signal


def correlate_scipy(a, b):
    """Return linear correlation of two one-dimensional arrays.

    The correlation is computed as FFT-based convolution of `a` with the
    reversed `b`, cropped to the size of `a`.

    """
    reversed_b = b[::-1]
    return scipy.signal.fftconvolve(a, reversed_b, mode='same')


test_correlate(correlate_scipy)

%timeit correlate_scipy(A, B)
%timeit map_threaded(correlate_scipy, [A, A], [B, B])

253 µs ± 10.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
655 µs ± 7.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

import numpy.fft


def correlate_fft(a, b):
    """Return circular correlation of two one-dimensional arrays.

    The correlation is computed via the DFT of the real input. The result
    has the length of `a`, with zero delay shifted to index `size // 2`.

    """
    size = numpy.shape(a)[-1]
    # forward DFT
    a = numpy.fft.rfft(a)
    b = numpy.fft.rfft(b)
    # multiply by complex conjugate
    a *= b.conj()
    # reverse DFT; the output length must be given explicitly because the
    # default assumes an even length and is wrong for odd-sized input
    c = numpy.fft.irfft(a, n=size)
    # shift
    c = numpy.fft.fftshift(c)
    return c


%timeit correlate_fft(A, B)
%timeit map_threaded(correlate_fft, [A, A], [B, B])

120 µs ± 2.76 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
459 µs ± 3.72 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

import numpy

c = correlate_fft(
    numpy.pad(A, A.size // 2, mode='constant'),
    numpy.pad(B, B.size // 2, mode='constant'),
)

# remove zero-padding from result
c = c[A.size // 2 : -A.size // 2]

# compare to linear correlation
assert numpy.allclose(c, correlate_numpy(A, B))

if not SKIP_CUDA:
    import cupy.fft


def correlate_cufft(a, b):
    """Return circular correlation of two one-dimensional arrays.

    Same computation as `correlate_fft`, performed with cupy on the
    current GPU device. The result is returned as numpy array.

    """

    # move arrays to the current GPU device
    a = cupy.asarray(a)
    b = cupy.asarray(b)
    size = a.shape[-1]

    a = cupy.fft.rfft(a)
    b = cupy.fft.rfft(b)
    a *= b.conj()
    # the output length must be given explicitly because the default
    # assumes an even length and is wrong for odd-sized input
    c = cupy.fft.irfft(a, n=size)
    c = cupy.fft.fftshift(c)

    # move array from GPU device to the host
    return cupy.asnumpy(c)


if not SKIP_CUDA:
    %timeit correlate_cufft(A, B)
    # very slow: %timeit map_threaded(correlate_cufft, [A, A], [B, B])

326 µs ± 2.48 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

from distutils import ccompiler

compiler = ccompiler.new_compiler()
objects = compiler.compile(['fft2d/fftsg.c'], extra_postargs=['-fPIC', '-O2'])
compiler.create_static_lib(objects, 'ftt2d', output_dir='.')

%%cython -f --compile-args=-O2 -I. -l./ftt2d
#
#cython: boundscheck=False
#cython: wraparound=False

import numpy

from libc.stdlib cimport malloc, free
from libc.math cimport sqrt

cdef extern from 'fft2d.h':
    void rdft(int n, int isgn, double *a, int *ip, double *w) nogil


def correlate_cython_fft2d(a, b):
    """Return circular correlation of two one-dimensional arrays.

    The transforms are computed in-place on copies of the input arrays by
    the `rdft` function declared in 'fft2d.h'. The work arrays required by
    `rdft` are released on success and on failure.

    Raise MemoryError if the work arrays can not be allocated.

    """
    cdef:
        ssize_t size = a.size
        double scale = 2.0 / size
        double[::1] a_
        double[::1] b_
        double *w_ = NULL
        int *ip_ = NULL

    # copy input arrays. rdft computes in-place
    result = numpy.copy(a)
    a_ = result
    b_ = numpy.copy(b)

    # allocate cos/sin table
    w_ = <double *>malloc((size // 2) * sizeof(double))
    # allocate work area for bit reversal
    ip_ = <int *>malloc((2 + <int>(sqrt((size//2) + 0.5))) * sizeof(int))

    try:
        if not w_:
            raise MemoryError('could not allocate w_')
        if not ip_:
            raise MemoryError('could not allocate ip_')
        ip_[0] = 0

        with nogil:
            # forward DFT
            rdft(size, 1, &b_[0], ip_, w_)
            rdft(size, 1, &a_[0], ip_, w_)

            # multiply by complex conjugate
            multiply_conj(a_, b_, size)

            # reverse DFT
            rdft(size, -1, &a_[0], ip_, w_)

            # shift and scale results
            fftshift(a_, size, scale)
    finally:
        # free(NULL) is a no-op, so this is safe after a failed allocation
        free(w_)
        free(ip_)

    return result


cdef void multiply_conj(
    double[::1] a,
    double[::1] b,
    ssize_t size
) noexcept nogil:
    """Multiply a in-place by complex conjugate of b.

    The first two elements are multiplied as plain real numbers. Every
    following pair of elements is treated as the real and imaginary part
    of one complex number.

    """
    cdef:
        ssize_t k
        double re_a, im_a, re_b, im_b

    # elements 0 and 1 are handled as real values
    a[0] *= b[0]
    a[1] *= b[1]

    # (re_a + i*im_a) * (re_b - i*im_b)
    for k in range(2, size, 2):
        re_a = a[k]
        im_a = a[k+1]
        re_b = b[k]
        im_b = b[k+1]
        a[k] = re_a * re_b + im_a * im_b
        a[k+1] = im_a * re_b - re_a * im_b


cdef void fftshift(
    double[::1] a,
    ssize_t size,
    double scale
) noexcept nogil:
    """Swap the two halves of a in-place and multiply elements by scale.

    Only the first `2 * (size // 2)` elements are touched.

    """
    cdef:
        ssize_t half = size // 2
        ssize_t lower, upper
        double swapped

    for lower in range(half):
        upper = lower + half
        swapped = a[lower]
        a[lower] = a[upper] * scale
        a[upper] = swapped * scale

assert numpy.allclose(correlate_cython_fft2d(A, B), correlate_fft(A, B))

%timeit correlate_cython_fft2d(A, B)
%timeit map_threaded(correlate_cython_fft2d, [A, A], [B, B])

58.4 µs ± 265 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
307 µs ± 1.39 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

import timeit

import numpy
from IPython.display import display
from ipywidgets import IntProgress


def time_functions(functions, size=2**14, max_workers=MAXCPUS):
    """Return runtimes of single and multi-threaded correlation functions.

    Parameters
    ----------
    functions : sequence of str
        Names of correlation functions, looked up in the module globals.
    size : int
        Length of the two random input arrays.
    max_workers : int
        Number of threads, and of input array pairs, used for the
        multi-threaded timing.

    Returns
    -------
    pandas.DataFrame or list
        One row per function containing the single-threaded time per call,
        the multi-threaded time per batch, both in ms, and the speedup.
        A row of NaN marks a function that is missing or failed.
        A plain list of rows is returned if pandas is not installed.

    """
    progress = IntProgress(min=0, max=2 * len(functions))
    display(progress)

    a = numpy.random.random(size) - 0.5
    b = numpy.random.random(size) - 0.5
    ab = [a] * max_workers, [b] * max_workers

    result = []
    for count, function in enumerate(functions, start=1):
        try:
            func = globals()[function]
            # one call to estimate how many repeats fit into about a second
            t0 = timeit.Timer(lambda: func(a, b)).timeit(number=1)
            number = max(2, int(1 / t0))
            t0 = timeit.Timer(lambda: func(a, b)).timeit(number)
            progress.value += 1
            # CUDA functions are identified by name; testing for a bare
            # 'cu' would also match names such as 'correlate_circular'
            is_cuda = any(
                tag in function for tag in ('cuda', 'cufft', 'cupy')
            )
            if not is_cuda:
                t1 = timeit.Timer(
                    lambda: map_threaded(func, *ab, max_workers=max_workers)
                ).timeit(number)
            else:
                # do not run CUDA multi-threaded
                t1 = math.inf
            result.append(
                [
                    '{:.2f}'.format(t0 * 1e3 / number),
                    '{:.2f}'.format(t1 * 1e3 / number),
                    '{:.1f}'.format(t0 / t1 * max_workers),
                ]
            )
        except Exception:
            # best effort: a missing or failing function gets a row of NaN
            result.append([float('nan')] * 3)
        finally:
            # two steps per function, also when the function failed
            progress.value = 2 * count
    progress.close()
    try:
        import pandas

        columns = [
            '1 thread / ms',
            '{} threads / ms'.format(max_workers),
            'speedup',
        ]
        return pandas.DataFrame(result, index=functions, columns=columns)
    except ImportError:
        return result


display(
    time_functions(
        [
            # 'correlate_python'
            'correlate_numpy',
            'correlate_cython',
            'correlate_cython_omp',
            'correlate_numba',
            'correlate_numba_parallel',
            'correlate_numba_cuda',
            'correlate_scipy',
            'correlate_fft',
            'correlate_cufft',
            'correlate_cython_fft2d',
        ]
    )
)

IntProgress(value=0, max=20)

import numpy


def rawread(filename, shape, dtype):
    """Return array of given shape and dtype read from binary file.

    If the product of the shape is negative, for example because one
    dimension is -1, all items are read from the file.

    """
    nitems = numpy.prod(shape, dtype=numpy.intp)
    if nitems < 0:
        nitems = -1  # read the whole file
    array = numpy.fromfile(filename, dtype=dtype, count=nitems)
    array.shape = shape
    return array


SIMULATION_DATA = rawread(
    os.path.join(DATA_PATH, 'Simulation_Channel.bin'),
    shape=(-1, 64, 64),
    dtype=numpy.uint16,
)

print('Data shape:', SIMULATION_DATA.shape)

Data shape: (32000, 64, 64)

def shape2pow2(data, axis):
    """Return array with axis truncated to power of 2.

    Parameters
    ----------
    data : array
        Array to truncate. The result is obtained by slicing `data`.
    axis : int or iterable of int
        Axis or axes to truncate to the largest power of two that is not
        larger than their length. Negative values count from the last
        axis. Axes of length zero are left unchanged.

    """
    try:
        axes = tuple(axis)
    except TypeError:
        axes = (axis,)
    ndim = len(data.shape)
    # map negative axes to their positive equivalent
    selected = {ax + ndim if ax < 0 else ax for ax in axes}
    slices = []
    for i, size in enumerate(data.shape):
        if i in selected and size > 0:
            # exact integer arithmetic instead of a floating point logarithm
            size = 1 << (int(size).bit_length() - 1)
        slices.append(slice(0, size))
    return data[tuple(slices)]


SIMULATION_DATA = shape2pow2(SIMULATION_DATA, axis=0)

print('Truncated shape:', SIMULATION_DATA.shape)

Truncated shape: (16384, 64, 64)

from matplotlib import pyplot


def plot_image_timeseries(image_timeseries):
    """Plot image of temporal mean and time series of spatial mean.

    The image, averaged over the first axis, is shown in the upper two
    thirds of the figure; the time series, averaged over the second and
    third axes, in the lower third.

    """
    pyplot.figure(figsize=(6, 8))

    # upper panel: average over the first axis
    pyplot.subplot(3, 1, (1, 2))
    temporal_mean = numpy.mean(image_timeseries, axis=0)
    pyplot.title('image of temporal mean')
    pyplot.imshow(temporal_mean, cmap='viridis', interpolation='none')
    pyplot.colorbar(shrink=0.83, pad=0.05)

    # lower panel: average over the second and third axes
    pyplot.subplot(3, 1, 3)
    spatial_mean = numpy.mean(image_timeseries, axis=(1, 2))
    pyplot.title('time series of spatial mean')
    pyplot.xlabel('time index')
    pyplot.ylabel('intensity')
    pyplot.plot(spatial_mean)
    pyplot.xlim([0, len(spatial_mean)])

    pyplot.tight_layout()
    pyplot.show()


plot_image_timeseries(SIMULATION_DATA)

import numpy


def correlate_circular(a, b):
    """Return normalized circular correlation of two arrays using DFT.

    Only the first `size // 2` delays, starting at zero delay, are
    returned. The correlation is divided by the product of the means of
    `a` and `b`, and one is subtracted.

    """
    size = a.size

    # forward DFT
    a = numpy.fft.rfft(a)
    b = numpy.fft.rfft(b)
    # multiply by complex conjugate
    c = a.conj() * b
    # reverse DFT; the output length must be given explicitly because the
    # default assumes an even length and is wrong for odd-sized input
    c = numpy.fft.irfft(c, n=size)

    # positive delays only
    c = c[: size // 2]

    # normalize with the averages of a and b
    #   the 0th value of the DFT contains the sum of the signal, hence
    #   a[0] * b[0] / size equals size * mean(a) * mean(b)
    c /= a[0].real * b[0].real / size
    c -= 1.0

    return c


def correlate_linear(a, b):
    """Return normalized linear correlation of two arrays using DFT.

    The means are subtracted and the arrays padded with `size // 2` zeros
    on both sides before the transform. Only the first `size // 2` delays,
    starting at zero delay, are returned, divided by
    `size * mean(a) * mean(b)`.

    """
    size = a.size

    # subtract mean and pad with zeros to twice the size
    a_mean = a.mean()
    b_mean = b.mean()
    a = numpy.pad(a - a_mean, a.size // 2, mode='constant')
    b = numpy.pad(b - b_mean, b.size // 2, mode='constant')
    padded_size = a.size

    # forward DFT
    a = numpy.fft.rfft(a)
    b = numpy.fft.rfft(b)
    # multiply by complex conjugate
    c = a.conj() * b
    # reverse DFT; the output length must be given explicitly because the
    # padded length is odd for odd-sized input
    c = numpy.fft.irfft(c, n=padded_size)
    # positive delays only
    c = c[: size // 2]

    # normalize with the averages of a and b
    c /= size * a_mean * b_mean

    return c


def average(c, bins):
    """Return list of means of consecutive chunks of array.

    The first chunk runs from the start of `c` to the first bin; every
    other chunk lies between two consecutive bins.

    """
    means = [numpy.mean(c[: bins[0]])]
    means.extend(
        numpy.mean(c[lower:upper]) for lower, upper in zip(bins[:-1], bins[1:])
    )
    return means


def logbins(size, nbins):
    """Return up to nbins exponentially increasing integers from 1 to size.

    The values are nbins powers of two with evenly spaced exponents,
    truncated to integers; duplicates are removed.

    """
    stop = math.log(size, 2)
    edges = numpy.logspace(0, stop, num=nbins, base=2, endpoint=True)
    return numpy.unique(edges.astype(numpy.intp))


def smooth(c):
    """Return double exponentially smoothed copy of sequence.

    The first value is replaced by the second. A forward and a backward
    pass then mix each value (weight 0.3) with its already smoothed
    neighbor (weight 0.7).

    """
    out = c.copy()
    out[0] = out[1]
    length = len(out)
    # forward pass
    for i in range(1, length):
        out[i] = out[i] * 0.3 + out[i - 1] * 0.7
    # backward pass
    for i in reversed(range(length - 1)):
        out[i] = out[i] * 0.3 + out[i + 1] * 0.7
    return out

from ipywidgets import IntSlider, interact
from matplotlib import pyplot


def plot_pcf_processing(image_timeseries):
    """Compare linear and circular pair correlation functions.

    Interactively select two pixels of the image time series, indexed as
    [time, y, x], and plot their time series, their linear and circular
    cross-correlation functions on linear and logarithmic axes, and the
    log-binned and smoothed correlation curves.

    """
    ntimes, height, width = image_timeseries.shape

    def _plot(y0, x0, y1, x1):
        # select time series from image_timeseries
        a = image_timeseries[:, y0, x0]
        b = image_timeseries[:, y1, x1]

        # linear and circular correlation
        cl = correlate_linear(a, b)
        cc = correlate_circular(a, b)

        # average and smooth
        bins = logbins(a.size // 2, 32)
        averagedl = average(cl, bins)
        smoothedl = smooth(averagedl)
        averagedc = average(cc, bins)
        smoothedc = smooth(averagedc)

        pyplot.figure(figsize=(6, 12))

        # plot the time series
        pyplot.subplot(4, 1, 1)
        pyplot.title('time series')
        pyplot.xlabel('time index')
        pyplot.ylabel('intensity')
        pyplot.plot(a, 'g', label='[{}, {}]'.format(y0, x0))
        # label the second series with its own coordinates (y1, x1)
        pyplot.plot(b, 'b', label='[{}, {}]'.format(y1, x1))
        pyplot.xlim([0, len(a)])
        pyplot.legend(fancybox=True, framealpha=0.9)

        # plot the cross-correlation function and logbins
        pyplot.subplot(4, 1, 2)
        pyplot.title('normalized cross-correlation functions and logbins')
        pyplot.xlabel('positive time delay index')
        pyplot.ylabel('correlation')
        for x in bins:
            pyplot.axvline(x=x, color='0.8')
        pyplot.plot(cl, 'g', label='linear')
        pyplot.plot(cc, 'b', label='circular')
        pyplot.xlim([0, len(cc)])
        pyplot.legend(fancybox=True, framealpha=0.9)

        # log-plot the cross-correlation function and logbins
        pyplot.subplot(4, 1, 3)
        pyplot.title('log-plot of cross-correlation functions and logbins')
        pyplot.xlabel('positive time delay index')
        pyplot.ylabel('correlation')
        for x in bins:
            pyplot.axvline(x=x, color='0.8')
        pyplot.semilogx(cl, 'g', label='linear', base=2)
        pyplot.semilogx(cc, 'b', label='circular', base=2)
        pyplot.xlim([0, len(cc)])
        pyplot.legend(fancybox=True, framealpha=0.9)

        # plot the binned and smoothed cross-correlation function
        pyplot.subplot(4, 1, 4)
        pyplot.title('averaged and smoothed cross-correlation functions')
        pyplot.xlabel('positive log time delay index')
        pyplot.ylabel('correlation')
        pyplot.plot(averagedl, 'g', label='linear')
        pyplot.plot(smoothedl, 'm')
        pyplot.plot(averagedc, 'b', label='circular')
        pyplot.plot(smoothedc, 'r', label='smoothed')
        pyplot.legend(fancybox=True, framealpha=0.9)

        pyplot.tight_layout()
        pyplot.show()

    interact(
        _plot,
        y0=IntSlider(31, 0, height - 1, continuous_update=False),
        x0=IntSlider(31, 0, width - 1, continuous_update=False),
        y1=IntSlider(35, 0, height - 1, continuous_update=False),
        x1=IntSlider(35, 0, width - 1, continuous_update=False),
    )


plot_pcf_processing(SIMULATION_DATA)

import numpy
from numpy import zeros


def ipcf_reference(image_timeseries, circle_coordinates, bins):
    """Return pair correlation function analysis of image time series.

    For every pixel that is at least one radius away from the image
    border, cross-correlate the time series of each neighbor on the
    circle with the time series of the pixel, then log-bin and smooth
    the correlation curve.

    The image time series is indexed as [time, y, x]. The radius is read
    from the first element of the circle coordinates. The result is a
    float32 array of shape
    (height - 2 * radius, width - 2 * radius, npoints, len(bins)).

    """
    ntimes, height, width = image_timeseries.shape
    radius = circle_coordinates[0, 0]
    npoints = len(circle_coordinates)

    out_shape = (
        height - 2 * radius,
        width - 2 * radius,
        npoints,
        len(bins),
    )
    result = zeros(out_shape, numpy.float32)

    for row in range(radius, height - radius):
        for col in range(radius, width - radius):
            center = image_timeseries[:, row, col]
            for k, (du, dv) in enumerate(circle_coordinates):
                neighbor = image_timeseries[:, row + dv, col + du]
                curve = average(correlate(neighbor, center), bins)
                result[row - radius, col - radius, k] = smooth(curve)

    return result


def correlate(a, b):
    """Return normalized circular correlation of two 1D signals using DFT.

    Only the first half of the delays (the positive ones) is returned.
    The result is divided by the product of the signal averages and
    offset by -1, such that uncorrelated signals give values around 0.

    """
    nsamples = a.size
    spectrum_a = numpy.fft.rfft(a)
    spectrum_b = numpy.fft.rfft(b)

    # correlation theorem: inverse DFT of A times the conjugate of B
    corr = numpy.fft.irfft(spectrum_a * spectrum_b.conj())[: nsamples // 2]

    # the DC terms of the spectra hold the sums of the signals
    norm = spectrum_a[0].real * spectrum_b[0].real / nsamples
    corr /= norm
    corr -= 1.0
    return corr


def average(c, bins):
    """Return list of means of consecutive chunks of array.

    The first chunk is c[:bins[0]]; every following chunk spans two
    neighboring bin edges, c[bins[i]:bins[i + 1]].

    """
    means = [numpy.mean(c[: bins[0]])]
    means.extend(
        numpy.mean(c[start:stop]) for start, stop in zip(bins[:-1], bins[1:])
    )
    return means


def smooth(c):
    """Return double exponentially smoothed copy of sequence.

    The first item is replaced by the second one, then one forward and
    one backward exponential smoothing pass are applied. The input is
    not modified.

    """
    new_weight, old_weight = 0.3, 0.7
    smoothed = c.copy()
    smoothed[0] = smoothed[1]
    last = len(smoothed) - 1

    # forward pass
    for k in range(1, last + 1):
        smoothed[k] = smoothed[k] * new_weight + smoothed[k - 1] * old_weight

    # backward pass
    for k in reversed(range(last)):
        smoothed[k] = smoothed[k] * new_weight + smoothed[k + 1] * old_weight

    return smoothed


def logbins(size, nbins):
    """Return up to nbins exponentially increasing integers from 1 to size.

    Fewer than nbins values are returned when several bin edges
    truncate to the same integer.

    """
    max_exponent = math.log(size, 2)
    edges = numpy.logspace(
        start=0, stop=max_exponent, num=nbins, endpoint=True, base=2
    )
    # truncate to integers and drop duplicates (result is sorted)
    return numpy.unique(edges.astype(numpy.intp))


def circle(radius, npoints):
    """Return cartesian coordinates of circle on integer grid.

    The result is a C-contiguous integer array of shape (npoints, 2)
    holding (x, y) offsets, starting at angle 0.

    """
    theta = numpy.linspace(0, 2 * numpy.pi, npoints, endpoint=False)
    xy = numpy.column_stack(
        (radius * numpy.cos(theta), radius * numpy.sin(theta))
    )
    return numpy.ascontiguousarray(numpy.round(xy).astype(numpy.intp))

import numpy


def run_ipcf(
    ipcf_function, image_timeseries, radius=6, npoints=32, nbins=32, **kwargs
):
    """Run ipcf_function on image_timeseries.

    The time axis (first axis) is truncated to a power of two, the circle
    coordinates and log-bins are computed, and everything is passed to
    `ipcf_function` together with any additional keyword arguments.

    """
    nframes, _, _ = image_timeseries.shape

    # largest power of two not exceeding the number of frames
    pow2_frames = 2 ** int(math.log(nframes, 2))

    return ipcf_function(
        image_timeseries[:pow2_frames],
        circle(radius, npoints),
        # only half of the delays are positive
        logbins(pow2_frames // 2, nbins),
        **kwargs,
    )


def test_ipcf(result, expected=None, atol=1e-6):
    """Compare ipcf result to known results from file.

    Nothing happens when `result` matches `expected` (by default the
    module-level SIMULATION_IPCF_EXPECTED) within `atol`. Otherwise the
    difference is plotted, or 'Test failed' is printed if the plotting
    raises NameError.

    """
    reference = SIMULATION_IPCF_EXPECTED if expected is None else expected
    if numpy.allclose(result, reference, atol=atol):
        return
    try:
        plot_ipcf_results(result - reference)
    except NameError:
        print('Test failed')


SIMULATION_IPCF_EXPECTED = rawread(
    os.path.join(DATA_PATH, 'Simulation_Channel.ipcf.bin'),
    (52, 52, 32, 30),
    numpy.float32,
)

%time SIMULATION_IPCF_RESULT = run_ipcf(ipcf_reference, SIMULATION_DATA)

test_ipcf(SIMULATION_IPCF_RESULT)

CPU times: total: 30.3 s
Wall time: 54.3 s

import numpy
from ipywidgets import Dropdown, IntSlider, interact
from matplotlib import pyplot


def plot_ipcf_sprites(ipcf_result, figsize=(6, 5)):
    """Interactively plot pair correlation functions at pixel.

    `ipcf_result` is a 4D array of shape (height, width, npoints, nbins).
    Widgets select the pixel (y, x) and one of three display styles:

    - 'carpet': image of circle point index versus log time delay index.
    - 'polar': same data on a polar grid, where the angle is the circle
      point and the radius is the log time delay index.
    - 'lines': one curve per circle point.

    `figsize` is passed to `pyplot.figure`.

    """
    height, width, npoints, nbins = ipcf_result.shape

    # data limits, shared by all pixels so that plots are comparable
    vmax, vmin = numpy.max(ipcf_result), numpy.min(ipcf_result)
    # symmetric limit keeps zero at the center of the diverging colormap
    vminmax = max(abs(vmax), abs(vmin))

    # coordinates for polar plot and Delaunay triangulation
    radius = numpy.arange(nbins)
    angles = numpy.linspace(0, 2 * numpy.pi, npoints, endpoint=False)
    radius, angles = numpy.meshgrid(radius, angles)
    xcoords = radius * numpy.cos(angles)
    ycoords = radius * numpy.sin(-angles)

    def _plot(style, y, x):
        pyplot.figure(figsize=figsize)
        pyplot.title('pair correlation functions at pixel')
        sprite = ipcf_result[y, x]
        if style == 'lines':
            pyplot.plot(sprite.T, 'b')
            pyplot.ylim([vmin, vmax])
            pyplot.xlabel('log time delay index')
            pyplot.ylabel('pcf')
        elif style == 'carpet':
            pyplot.imshow(
                sprite,
                vmin=-vminmax,
                vmax=vminmax,
                cmap='seismic',
                interpolation='none',
            )
            pyplot.xlabel('log time delay index')
            pyplot.ylabel('circle point index')
            pyplot.colorbar()
        elif style == 'polar':
            # polar plot using Delaunay triangulation
            pyplot.tripcolor(
                xcoords.flat,
                ycoords.flat,
                sprite.flat,
                vmin=-vminmax,
                vmax=vminmax,
                shading='gouraud',
                cmap='seismic',
            )
            # use the axes tripcolor has drawn into; pyplot.axes() without
            # arguments would add a new, empty axes on top of the plot
            pyplot.gca().set_aspect('equal')
            pyplot.axis('off')
            pyplot.colorbar()
        pyplot.show()

    interact(
        _plot,
        style=Dropdown(options=['carpet', 'polar', 'lines']),
        y=IntSlider(height // 2, 0, height - 1, continuous_update=False),
        x=IntSlider(width // 2, 0, width - 1, continuous_update=False),
    )


plot_ipcf_sprites(SIMULATION_IPCF_RESULT)

import numpy
from ipywidgets import IntSlider, interact
from matplotlib import pyplot


def plot_ipcf_images(ipcf_result, figsize=(6, 5), interpolation='none'):
    """Interactively plot image of pair correlation function values.

    `ipcf_result` is a 4D array of shape (height, width, npoints, nbins).
    Widgets select the circle point and the time delay bin; the
    corresponding (height, width) image is shown with a diverging
    colormap centered at zero. Images that are much taller than wide
    are displayed transposed.

    `figsize` is passed to `pyplot.figure`, `interpolation` to
    `pyplot.imshow`.

    """
    height, width, npoints, nbins = ipcf_result.shape

    # display tall images rotated; the colorbar then goes below the image
    transpose = height > 1.5 * width
    orientation = 'horizontal' if transpose else 'vertical'

    # data limits, shared by all images so that plots are comparable
    vmax, vmin = numpy.max(ipcf_result), numpy.min(ipcf_result)
    vminmax = max(abs(vmax), abs(vmin))

    def _plot(point, bin):
        # the parameter names are the keywords passed to interact below
        pyplot.figure(figsize=figsize)
        image = ipcf_result[:, :, point, bin]
        if transpose:
            image = image.T
        pyplot.title('pair correlation function values')
        pyplot.imshow(
            image,
            vmin=-vminmax,
            vmax=vminmax,
            cmap='seismic',
            interpolation=interpolation,
        )
        pyplot.colorbar(orientation=orientation)
        pyplot.show()

    interact(
        _plot,
        point=IntSlider(npoints // 2, 0, npoints - 1, continuous_update=False),
        bin=IntSlider(nbins // 2, 0, nbins - 1, continuous_update=False),
    )


plot_ipcf_images(SIMULATION_IPCF_RESULT)

import numba
import numpy


def ipcf_optimized(image_timeseries, circle_coordinates, bins, **kwargs):
    """Return pair correlation function analysis of image time series.

    Takes the same positional arguments as `ipcf_reference` and returns a
    float32 array of shape
    (height - 2*radius, width - 2*radius, npoints, len(bins)).

    Compared to the reference implementation:

    - the forward DFT of every pixel is computed once, up front;
    - each inverse DFT is used twice: its positive delays for the current
      pixel and its negative delays for the neighbor pixel;
    - averaging, smoothing, and scaling are done by the compiled
      `average_smooth_scale` function.

    Additional keyword arguments are accepted and ignored.

    """
    ntimes, height, width = image_timeseries.shape
    npoints = len(circle_coordinates)
    nbins = len(bins)
    # the x component of the first circle point is used as the radius
    radius = circle_coordinates[0, 0]

    # zero-initialized: a non-zero first bin marks an entry as calculated
    result = numpy.zeros(
        (height - 2 * radius, width - 2 * radius, npoints, nbins),
        numpy.float32,
    )

    # make time axis last dimension
    data = numpy.moveaxis(image_timeseries, 0, -1)

    # pre-calculate forward DFT along time axis
    rfft_buffer = numpy.fft.rfft(data, axis=-1)

    for y in range(radius, height - radius):
        for x in range(radius, width - radius):
            # complex conjugate of the DFT of the center pixel
            rfft_a = rfft_buffer[y, x].conj()

            for i in range(npoints):
                # continue if output was already calculated
                # NOTE(review): an entry whose first bin is exactly 0.0
                # would be calculated again - presumably harmless
                if result[y - radius, x - radius, i, 0] != 0.0:
                    continue

                u, v = circle_coordinates[i]
                rfft_b = rfft_buffer[y + v, x + u]

                # cross-correlate b and a
                c = numpy.fft.irfft(rfft_a * rfft_b)

                # the DC terms hold the sums of the two time series
                scale = ntimes / rfft_a[0].real / rfft_b[0].real

                # positive delays
                average_smooth_scale(
                    c, bins, scale, result[y - radius, x - radius, i]
                )

                # negative delays
                # only if the neighbor pixel is itself part of the output
                if (
                    radius <= y + v < height - radius
                    and radius <= x + u < width - radius
                ):
                    c = numpy.fft.fftshift(c)
                    # index shifted by half the number of circle points;
                    # presumably the opposite point on the circle - assumes
                    # an even number of point-symmetric coordinates
                    # (rebinding i does not affect the range() iteration)
                    i = (i + npoints // 2) % npoints
                    average_smooth_scale(
                        # first half of the shifted curve in reverse order
                        c[ntimes // 2 : 0 : -1],
                        bins,
                        scale,
                        result[y + v - radius, x + u - radius, i],
                    )

    return result


@numba.jit(nogil=True)
def average_smooth_scale(c, bins, scale, out):
    """Average, smooth, and scale correlation function.

    The correlation curve `c` is averaged in the chunks delimited by
    `bins`, smoothed with a forward and a backward exponential pass,
    multiplied by `scale`, and offset by -1.

    The result is written to `out` in-place; nothing is returned.
    `out` must hold at least len(bins) items.

    """
    # average
    out[0] = numpy.mean(c[: bins[0]])
    for i in range(len(bins) - 1):
        out[i + 1] = numpy.mean(c[bins[i] : bins[i + 1]])

    # smooth
    # the average of the first chunk is discarded
    out[0] = out[1]
    # forward pass
    for i in range(1, len(bins)):
        out[i] = out[i] * 0.3 + out[i - 1] * 0.7
    # backward pass
    for i in range(len(bins) - 2, -1, -1):
        out[i] = out[i] * 0.3 + out[i + 1] * 0.7

    # scale
    out *= scale
    out -= 1.0


%time ipcf_result = run_ipcf(ipcf_optimized, SIMULATION_DATA)

test_ipcf(ipcf_result)

CPU times: total: 3.19 s
Wall time: 5.89 s

%%cython -f --compile-args=-O2 -I. -l./ftt2d  $OPENMP_ARGS
#
#cython: boundscheck=False
#cython: wraparound=False
#cython: cdivision=True

import math

import numpy
from cython.parallel import parallel, prange

cimport numpy
from libc.math cimport sqrt
from libc.stdlib cimport free, malloc


cdef extern from 'fft2d.h':
    void rdft(int n, int isgn, double *a, int *ip, double *w) nogil


def ipcf_cython(
    numpy.uint16_t [:, :, :] image_timeseries not None,
    ssize_t [:, ::1] circle_coordinates not None,
    ssize_t [::1] bins not None,
    int num_threads=0
):
    """Return pair correlation function analysis of image time series.

    Parameters are a 3D uint16 image time series with the time axis first,
    a C-contiguous 2D array of circle coordinates (x, y offsets), a 1D
    array of bin edges, and the number of threads passed to the OpenMP
    parallel sections.

    Returns a float32 array of shape
    (height - 2*radius, width - 2*radius, npoints, nbins).

    Raises ValueError if the radius is smaller than 2, the image is not
    larger than twice the radius, or the (power of two) length of the
    time axis is out of range. Raises MemoryError if a work buffer cannot
    be allocated.

    """
    cdef:
        ssize_t ntimes = image_timeseries.shape[0]
        ssize_t height = image_timeseries.shape[1]
        ssize_t width = image_timeseries.shape[2]
        ssize_t nbins = bins.shape[0]
        ssize_t npoints = circle_coordinates.shape[0]
        # the x component of the first circle point is used as the radius
        ssize_t radius = circle_coordinates[0, 0]
        ssize_t x, y, u, v, i, t, x1, y1, t1
        double scale
        double *rfft_a
        double *rfft_b
        double *a_
        double *w_
        int *ip_
        double [:, :, ::1] rdft_
        float[:, :, :, ::1] out
        
    # limit length of time axis to power of two
    ntimes = 2**int(math.log(ntimes, 2))

    if radius < 2:
        raise ValueError('invalid radius')
    if width <= 2*radius or height <= 2*radius:
        raise ValueError('invalid image size')
    # upper limit is the largest 32-bit signed integer; rdft takes an int
    if ntimes < 32 or ntimes > 2147483647:
        raise ValueError('invalid size of time axis')

    # output array
    # zero-initialized: a non-zero first bin marks an entry as calculated
    result = numpy.zeros(
        (height-2*radius, width-2*radius, npoints, nbins), 
        dtype=numpy.float32
    )
    out = result

    # buffer for forward DFT
    rdft_ = numpy.empty((height, width, ntimes), dtype=numpy.float64)

    with nogil:
        # rdft cos/sin table
        w_ = <double *>malloc(ntimes // 2 * sizeof(double))
        if not w_:
            with gil:
                raise MemoryError('could not allocate w_')

        # rdft work area for bit reversal
        # NOTE(review): w_ is not freed if this allocation fails
        ip_ = <int*>malloc((2 + <int>(sqrt((ntimes//2) + 0.5))) * sizeof(int))
        if not ip_:
            with gil:
                raise MemoryError('could not allocate ip_')

        # initialize ip_ and w_
        # the transform runs on the still uninitialized buffer; its output
        # is overwritten by the forward DFT below
        ip_[0] = 0
        rdft(ntimes, 1, &rdft_[0, 0, 0], ip_, w_)


    with nogil, parallel(num_threads=num_threads):
        # thread-local input/output data
        a_ = <double *>malloc(sizeof(double) * ntimes)
        if not a_:
            with gil:
                raise MemoryError('could not allocate a_')

        # forward DFT
        # copy each pixel's time series to the buffer and transform in-place
        for y1 in prange(height):
            for x1 in range(width):
                for t1 in range(ntimes):
                    rdft_[y1, x1, t1] = <double>image_timeseries[t1, y1, x1]
                rdft(ntimes, 1, &rdft_[y1, x1, 0], ip_, w_)

        # cross-correlation
        for y in prange(radius, height-radius):
            for x in range(radius, width-radius):
                rfft_a = &rdft_[y, x, 0]

                for i in range(npoints):
                    # continue if output was already calculated
                    # NOTE(review): entries may have been written by another
                    # thread (negative delays below); this check and the
                    # writes look unsynchronized - confirm that is benign
                    if out[y-radius, x-radius, i, 0] != 0.0:
                        continue

                    # absolute position of the neighbor pixel
                    u = x + circle_coordinates[i, 0]
                    v = y + circle_coordinates[i, 1]

                    rfft_b = &rdft_[v, u, 0]

                    # multiply b's DFT by complex conjugate of a's DFT
                    multiply_conj(rfft_b, rfft_a, a_, ntimes)

                    # inverse DFT
                    rdft(ntimes, -1, a_, ip_, w_)

                    # first items of the transforms are used as signal sums;
                    # the factor 2.0 presumably accounts for the scaling of
                    # the inverse rdft - TODO confirm against fft2d docs
                    scale = 2.0 / (rfft_a[0] * rfft_b[0])

                    # positive delays
                    average_smooth_scale(
                        a_, 
                        ntimes, 
                        bins, 
                        nbins, 
                        scale,
                        out[y-radius, x-radius, i]
                    )

                    # negative delays
                    # only if the neighbor pixel is itself part of the output
                    if (
                        (v >= radius) and 
                        (v < height-radius) and
                        (u >= radius) and 
                        (u < width-radius)
                    ):
                        # index shifted by half the number of circle points;
                        # presumably the opposite point on the circle
                        # NOTE(review): i is reassigned inside the loop body;
                        # presumably the range loop continues from its own
                        # counter as in Python - confirm in the generated C
                        i = (i + npoints // 2) % npoints
                        average_smooth_scale(
                            a_, 
                            ntimes, 
                            bins, 
                            nbins, 
                            scale,
                            out[v-radius, u-radius, i], 
                            -1
                        )
        free(a_)

    free(w_)
    free(ip_)

    return result


cdef void multiply_conj(
    double *a, 
    double *b, 
    double *c, 
    ssize_t size
) noexcept nogil:
    """Multiply `a` by complex conjugate of `b` and store in `c`.

    All three arrays hold `size` doubles. The first two items are
    multiplied as plain real numbers; all following items are treated
    as interleaved (real, imaginary) pairs.

    """
    cdef:
        ssize_t pos
        double re_a, im_a, re_b, im_b

    # the leading two items have no imaginary counterpart
    c[0] = a[0] * b[0]
    c[1] = a[1] * b[1]

    # (re_a + i*im_a) * (re_b - i*im_b) for the remaining pairs
    for pos in range(2, size, 2):
        re_a = a[pos]
        im_a = a[pos+1]
        re_b = b[pos]
        im_b = b[pos+1]
        c[pos] = re_a * re_b + im_a * im_b
        c[pos+1] = im_a * re_b - re_a * im_b


cdef void average_smooth_scale(
    double *a, 
    ssize_t size,
    ssize_t[::1] bins, 
    ssize_t nbins, 
    double scale,
    float[::1] out, 
    int mode=1
) noexcept nogil:
    """Average, smooth, and scale correlation function.

    The correlation curve `a` of length `size` is averaged in the chunks
    delimited by the `nbins` edges in `bins`, smoothed with a forward and
    a backward exponential pass, multiplied by `scale`, offset by -1, and
    written to `out` as single precision floats.

    With `mode` 1 (positive delays) the curve is read forwards from the
    start of the array. With any other value (negative delays) item 0 and
    the items counted backwards from the end of the array are read.

    The first nbins items of the input array are changed.

    """
    cdef:
        ssize_t i, j
        double s

    # average
    # the averages are stored in the first nbins items of a itself;
    # assumes strictly increasing bins starting at 1 or larger, such that
    # no item is overwritten before it was read - TODO confirm
    if mode == 1:
        # positive delays
        s = 0.0
        for i in range(bins[0]):
            s += a[i]
        a[0] = s / <double>bins[0]
        for j in range(1, nbins):
            s = 0.0
            for i in range(bins[j-1], bins[j]):
                s += a[i]
            a[j] = s / <double>(bins[j] - bins[j-1])
    else:
        # negative delay
        # delay 0 is item 0; delay -i is item size - i
        s = a[0]
        for i in range(1, bins[0]):
            s += a[size - i]
        a[0] = <float>(s / <double>bins[0])
        for j in range(1, nbins):
            s = 0.0
            for i in range(bins[j-1], bins[j]):
                s += a[size - i]
            a[j] = s / <double>(bins[j] - bins[j-1])

    # smooth
    # the average of the first chunk is discarded
    a[0] = a[1]
    # forward pass
    for i in range(1, nbins):
        a[i] = a[i] * 0.3 + a[i-1] * 0.7
    # backward pass
    for i in range(nbins-2, -1, -1):
        a[i] = a[i] * 0.3 + a[i+1] * 0.7

    # copy to output with scaling
    for i in range(nbins):
        out[i] = <float>(a[i] * scale - 1.0)

%time ipcf_result = run_ipcf(ipcf_cython, SIMULATION_DATA, num_threads=MAXCPUS)

test_ipcf(ipcf_result)

CPU times: total: 3.05 s
Wall time: 500 ms

import glob

import numpy
import tifffile
from ipywidgets import IntSlider, interact
from matplotlib import pyplot


def browse_images(filenames, vmin=0, vmax=None, imread=tifffile.imread):
    """Interactively plot series of image files.

    A slider selects one of `filenames`; the file is read with `imread`
    and displayed transposed, using `vmin` and `vmax` as color limits.
    Raise ValueError if `filenames` is empty.

    """
    if not filenames:
        raise ValueError('data files not found')

    imshow_options = dict(
        vmin=vmin,
        vmax=vmax,
        cmap='viridis',
        interpolation='lanczos',
    )

    def _plot(fileindex=0):
        path = filenames[fileindex]
        pixels = imread(path)
        pyplot.figure(figsize=(8, 6))
        # file name without directory as title
        pyplot.title(os.path.basename(path))
        pyplot.imshow(pixels.T, **imshow_options)
        pyplot.colorbar(orientation='horizontal')
        pyplot.show()

    selector = IntSlider(
        value=0, min=0, max=len(filenames) - 1, continuous_update=False
    )
    interact(_plot, fileindex=selector)


# name of the SPIM dataset; also the name of its directory in DATA_PATH
SPIM_DATASET_NAME = 'nih3t3-egfp_2'

# sorted list of all TIFF files in SPIM dataset
# (sorted() already returns a new list)
SPIM_FILENAMES = sorted(
    glob.glob(os.path.join(DATA_PATH, SPIM_DATASET_NAME, 'Pos0', '*.tif'))
)

browse_images(SPIM_FILENAMES)

import matplotlib
import skimage
import skimage.filters
import skimage.morphology
import skimage.restoration
import skimage.segmentation
import tifffile
from ipywidgets import FloatSlider, IntSlider, interact
from matplotlib import pyplot

# global variable where found regions will be stored
REGIONS_FOUND = []


def find_regions(filenames, imread=tifffile.imread):
    """Interactively find regions of interest in image files.

    The selected image is normalized, smoothed with a Gaussian filter,
    thresholded, and segmented. The bounding boxes of all segments that
    are larger than a minimum area are expanded to a multiple of a power
    of two and drawn on top of the smoothed image.

    Widgets control the file index, the sigma of the Gaussian filter,
    the intensity threshold (0 selects the mean intensity), the size of
    the structuring element used to close gaps, the minimum area, and
    the power of two for the size of the bounding boxes.

    The bounding boxes (minrow, mincol, maxrow, maxcol), sorted by
    decreasing area of the segments, are stored in the global variable
    REGIONS_FOUND each time the plot is updated.

    """
    # submodules that are used below but not imported at file level
    import matplotlib.patches
    import skimage.measure

    def _plot(
        fileindex=0,
        sigma=4.0,
        threshold=0.0,
        closegaps=10,
        minarea=64 * 64,
        pow2size=6,
    ):
        global REGIONS_FOUND

        # read image
        image = imread(filenames[fileindex])

        # normalize image to range 0..1
        image = image.astype(numpy.float64)
        image -= image.min()
        peak = image.max()
        if peak > 0:
            # a constant image stays all zero instead of becoming NaN
            image /= peak

        # remove noise by smoothing with Gaussian filter
        image = skimage.filters.gaussian(image, sigma)

        # binarize image with intensity threshold
        if threshold == 0.0:
            threshold = image.mean()
            # skimage.filters offers many threshold_* functions:
            #     otsu, li, yen, adaptive, and isodata
        binary = image > threshold

        # close small gaps
        binary = skimage.morphology.closing(
            binary, skimage.morphology.square(closegaps)
        )

        # remove artifacts connected to image border
        # clear_border returns a new array; the input is not modified
        binary = skimage.segmentation.clear_border(binary)

        # label image regions
        labels = skimage.measure.label(binary)

        # discard small regions
        regions = (
            r for r in skimage.measure.regionprops(labels) if r.area > minarea
        )

        # sort regions by area, largest first
        regions = reversed(sorted(regions, key=lambda x: x.area))

        def expand_bbox(bbox, shape):
            # return bounding box expanded to multiple of modulo
            minrow, mincol, maxrow, maxcol = bbox
            modulo = 2**pow2size
            div, mod = divmod(maxrow - minrow, modulo)
            if mod:
                # grow symmetrically, then shift back into the image
                d = (div + 1) * modulo
                minrow = max(0, minrow - (d - maxrow + minrow) // 2)
                maxrow = min(shape[0] - 1, minrow + d)
                minrow = max(0, maxrow - d)
            div, mod = divmod(maxcol - mincol, modulo)
            if mod:
                d = (div + 1) * modulo
                mincol = max(0, mincol - (d - maxcol + mincol) // 2)
                maxcol = min(shape[1] - 1, mincol + d)
                mincol = max(0, maxcol - d)
            return minrow, mincol, maxrow, maxcol

        # keep only bounding box of regions
        regions = [expand_bbox(r.bbox, image.shape) for r in regions]

        # plot image and regions
        pyplot.figure(figsize=(8, 5))
        pyplot.imshow(
            image.T,
            vmin=0.0,
            vmax=1.0,
            cmap='viridis',
            interpolation='lanczos',
        )
        ax = pyplot.gca()
        for region in regions:
            minrow, mincol, maxrow, maxcol = region
            # the image is shown transposed: rows run along the x-axis
            rect = matplotlib.patches.Rectangle(
                (minrow + 1, mincol + 1),
                maxrow - minrow - 2,
                maxcol - mincol - 2,
                fill=False,
                edgecolor='red',
                linewidth=2,
            )
            ax.add_patch(rect)
        # pyplot.colorbar(orientation='horizontal')
        pyplot.show()

        # store regions in global variable
        REGIONS_FOUND = regions
        return regions

    interact(
        _plot,
        fileindex=IntSlider(
            0, min=0, max=len(filenames) - 1, continuous_update=False
        ),
        sigma=FloatSlider(
            4, min=0.1, max=16.0, step=0.1, continuous_update=False
        ),
        threshold=FloatSlider(
            0.0, min=0.0, max=1.0, step=0.01, continuous_update=False
        ),
        closegaps=IntSlider(10, min=1, max=20, continuous_update=False),
        minarea=IntSlider(
            64 * 64, min=1, max=256 * 256, continuous_update=False
        ),
        pow2size=IntSlider(6, min=0, max=8, continuous_update=False),
    )


find_regions(SPIM_FILENAMES)

from concurrent.futures import ThreadPoolExecutor

import h5py
import tifffile


def tiff2hdf5(
    hdf5file,
    tifffiles,
    region,
    dataset_name='spim_data',
    chunks=(512, 16, 16),
    max_workers=32,
):
    """Write image region from TIFF files to chunked dataset in HDF5 file.

    `hdf5file` is the name of the HDF5 file to create; an existing file
    is overwritten. `tifffiles` is the sequence of TIFF file names, one
    image per time point. `region` is the (minrow, mincol, maxrow,
    maxcol) bounding box to extract from every image.

    The dataset `dataset_name` has shape (number of files, rows, columns),
    the data type of the first image, and the chunk shape `chunks`.
    The region and the name of the first file are stored as attributes.

    Files are converted in groups of chunks[0] images by up to
    `max_workers` threads. Exceptions raised while reading or writing
    are propagated to the caller.

    Raise ValueError if `tifffiles` is empty.

    """
    if len(tifffiles) == 0:
        raise ValueError('no TIFF files to convert')

    minrow, mincol, maxrow, maxcol = region
    nimages = len(tifffiles)
    shape = nimages, maxrow - minrow, maxcol - mincol
    # the first image determines the data type of the dataset
    dtype = tifffile.imread(tifffiles[0]).dtype

    # mode 'w' truncates an existing file; no need to delete old datasets
    with h5py.File(hdf5file, 'w') as hdf:
        dataset = hdf.create_dataset(
            dataset_name, shape=shape, dtype=dtype, chunks=chunks
        )
        dataset.attrs['region'] = region
        dataset.attrs['file'] = tifffiles[0]

        def convert_chunk(start, size=chunks[0]):
            # copy up to size images from TIFF files to HDF5 dataset
            # using a temporary buffer
            fnames = tifffiles[start : start + size]
            temp = numpy.empty(
                shape=(len(fnames), shape[1], shape[2]), dtype=dtype
            )

            for index, fname in enumerate(fnames):
                image = tifffile.imread(fname, key=0)
                temp[index] = image[minrow:maxrow, mincol:maxcol]

            # the last group may hold fewer than size images
            dataset[start : start + len(fnames)] = temp

        with ThreadPoolExecutor(max_workers) as executor:
            # iterate over the results: exceptions raised in a worker
            # are only re-raised when its result is retrieved
            for _ in executor.map(convert_chunk, range(0, nimages, chunks[0])):
                pass


def cleanup_hdf(remove=False):
    """Close handles and optionally remove existing HDF5 file.

    The module-level HDF5_FILE is flushed and closed, and the
    module-level names SPIM_DATASET and SPIM_IPCF_RESULT are deleted.
    If `remove` is true, the file named by the module-level
    HDF5_FILENAME is deleted from disk.

    All steps are best effort: globals that do not exist, handles that
    are already closed, and files that cannot be removed are ignored.

    """
    # operate on the module namespace; a `del NAME` statement inside this
    # function would make NAME local and could never delete the global
    namespace = globals()
    hdf5_file = namespace.get('HDF5_FILE')

    if hdf5_file is not None:
        try:
            hdf5_file.flush()
        except Exception:
            # e.g. the file is already closed
            pass

    # drop references to objects living in the HDF5 file before closing
    for name in ('SPIM_DATASET', 'SPIM_IPCF_RESULT'):
        namespace.pop(name, None)

    if hdf5_file is not None:
        try:
            hdf5_file.close()
        except Exception:
            pass

    if remove:
        filename = namespace.get('HDF5_FILENAME')
        if filename is not None:
            try:
                os.remove(filename)
            except OSError:
                # e.g. the file does not exist or is still in use
                pass


HDF5_FILENAME = os.path.join(SCRATCH_PATH, SPIM_DATASET_NAME) + '.hdf5'

cleanup_hdf(remove=True)

%time tiff2hdf5(HDF5_FILENAME, SPIM_FILENAMES[1000:21000], REGIONS_FOUND[0])

print(
    '{}  ({:.1f} GB)'.format(
        HDF5_FILENAME, os.path.getsize(HDF5_FILENAME) / 1024**3
    )
)

CPU times: total: 5.73 s
Wall time: 9.59 s
./nih3t3-egfp_2.hdf5  (6.1 GB)

import h5py

# the HDF5 file will stay open until the end of the document
HDF5_FILE = h5py.File(HDF5_FILENAME, 'r+')

SPIM_DATASET = HDF5_FILE['spim_data']

# print information about dataset
print('shape: ', SPIM_DATASET.shape)
print('dtype: ', SPIM_DATASET.dtype)
print('file:  ', SPIM_DATASET.attrs['file'])
print('region:', SPIM_DATASET.attrs['region'])
print()

# slicing datasets returns a numpy array in memory
print('reading a single image:')
%time image = SPIM_DATASET[10000]
print(image.shape)
print()

print('reading a single time series:')
%time timeseries = SPIM_DATASET[:, 400, 100]
print(timeseries.shape)

shape:  (20000, 832, 192)
dtype:  uint16
file:   ./nih3t3-egfp_2\Pos0\img_000001000_Default_000.tif
region: [104 156 936 348]

reading a single image:
CPU times: total: 0 ns
Wall time: 25 ms
(832, 192)

reading a single time series:
CPU times: total: 0 ns
Wall time: 2 ms
(20000,)

from ipywidgets import IntSlider, interact
from matplotlib import pyplot


def imshow_ts(image_timeseries, vmin=0, vmax=None):
    """Interactively plot images in time series.

    A slider with a step size of 100 selects the index along the first
    axis of `image_timeseries`; the image at that index is displayed
    transposed, using `vmin` and `vmax` as color limits.

    """
    last_index = image_timeseries.shape[0] - 1

    def _plot(index=0):
        frame = image_timeseries[index]
        pyplot.figure(figsize=(8, 5))
        pyplot.imshow(
            frame.T,
            cmap='viridis',
            interpolation='lanczos',
            vmin=vmin,
            vmax=vmax,
        )
        pyplot.colorbar(orientation='horizontal')
        pyplot.show()

    selector = IntSlider(
        value=0, min=0, max=last_index, step=100, continuous_update=False
    )
    interact(_plot, index=selector)


imshow_ts(SPIM_DATASET)

import numpy
from ipywidgets import IntSlider, interact
from matplotlib import pyplot


def plot_its(image_timeseries, ymax=None):
    """Interactively plot time series at selected pixel.

    Sliders select the pixel (y, x), the first time index, and the
    length of the plotted window as a power of two. `ymax` is the upper
    limit of the intensity axis.

    """
    ntimes, height, width = image_timeseries.shape
    # exponent of the largest power-of-two window that fits the series
    max_pow2 = int(math.log(ntimes, 2))
    time_axis = numpy.arange(ntimes)

    def _plot(y, x, start=0, pow2=max_pow2):
        # the window is clipped at the end of the time series
        stop = min(start + 2**pow2, ntimes)
        pyplot.figure(figsize=(6, 4))
        pyplot.title('time series')
        pyplot.xlabel('time index')
        pyplot.ylabel('intensity')
        pyplot.plot(time_axis[start:stop], image_timeseries[start:stop, y, x])
        axes = pyplot.gca()
        axes.set_xlim([start, stop - 1])
        axes.set_ylim([0, ymax])
        pyplot.show()

    widgets = dict(
        y=IntSlider(
            value=height // 2, min=0, max=height - 1, continuous_update=False
        ),
        x=IntSlider(
            value=width // 2, min=0, max=width - 1, continuous_update=False
        ),
        start=IntSlider(
            value=0, min=0, max=ntimes - 2**4, step=100, continuous_update=False
        ),
        pow2=IntSlider(
            value=max_pow2, min=4, max=max_pow2, step=1, continuous_update=False
        ),
    )
    interact(_plot, **widgets)


plot_its(SPIM_DATASET, ymax=2500)

%%cython -f --compile-args=-O2  $OPENMP_ARGS
#
#cython: boundscheck=False
#cython: wraparound=False
#cython: cdivision=True

import numpy
from cython.parallel import parallel, prange

cimport numpy
from libc.math cimport fabs, round, sqrt
from libc.stdlib cimport free, malloc

ctypedef numpy.uint16_t uint16_t


cdef void highpass_filter(
    uint16_t[:] data, 
    uint16_t[::1] out,
    double *a, 
    ssize_t size, 
    ssize_t filtersize
) noexcept nogil:
    """Subtract smoothed, add mean, and correct deficit.

    Using double exponential smoothing with factor 1/filtersize.

    `data` is the input time series and `out` receives the filtered
    time series; both hold `size` samples. `a` is a work buffer of at
    least `size` doubles that is overwritten with the smoothed series.

    """
    cdef:
        ssize_t i
        ssize_t sumd
        double t
        uint16_t d
        # weight of the current sample
        double f0 = 1.0 / <double>filtersize
        # weight of the previously smoothed sample
        double f1 = 1.0 - f0
        double mean
        double deficit

    # forward smoothing pass; also sum up all samples
    sumd = data[0]
    a[0] = <double>data[0]
    for i in range(1, size):
        d = data[i]
        sumd += d
        a[i] = <double>d * f0 + a[i-1] * f1
    # backward smoothing pass
    for i in range(size-2, -1, -1):
        a[i] = a[i] * f0 + a[i+1] * f1

    # NOTE(review): both operands are integers; whether this is a true or
    # a truncating division depends on the Cython language level - confirm
    mean = sumd / size

    for i in range(size):
        t = <double>data[i]
        # scale the fluctuations by the square root of the ratio of the
        # mean to the smoothed intensity
        # NOTE(review): no guard against a[i] being zero
        deficit = sqrt(fabs(mean / a[i]))
        t = round(deficit * (t - a[i]) + mean)
        # clip to the range of 16-bit unsigned integers
        if t < 0.5:
            t = 0.0
        elif t > 65534.5:
            t = 65535.0
        else:
            # t is already rounded; the cast below truncates the extra 0.5
            t = t + 0.5
        out[i] = <uint16_t>t


def correct_bleaching(
    numpy.ndarray[uint16_t, ndim=3] image_timeseries,
    ssize_t filtersize=1024, 
    int num_threads=0
):
    """Return image time series corrected for exponential photobleaching.

    The time series of every pixel is processed by `highpass_filter`
    using `num_threads` threads.

    The first and last 'filtersize' samples of 'image_timeseries' 
    are removed.

    If 'filtersize' is zero or negative, 'image_timeseries' is returned
    unchanged. Raise MemoryError if a work buffer cannot be allocated.

    """
    cdef:
        uint16_t[:, :, ::1] out
        uint16_t[:, :, ::] data = image_timeseries
        ssize_t ntimes = data.shape[0]
        ssize_t height = data.shape[1]
        ssize_t width = data.shape[2]
        ssize_t t, y, x
        double *a_
        
    if filtersize <= 0:
        return image_timeseries

    # allocate output array with contiguous time axis
    result = numpy.empty((height, width, ntimes), image_timeseries.dtype)
    out = result

    with nogil, parallel(num_threads=num_threads):

        # thread-local input/output data
        a_ = <double *>malloc(sizeof(double) * ntimes)
        if a_ == NULL:
            with gil:
                raise MemoryError('could not allocate a_')

        # image rows are distributed among the threads
        for y in prange(height):
            for x in range(width):
                highpass_filter(
                    data[:, y, x], out[y, x, :], a_, ntimes, filtersize
                )

        free(a_)

    # move the time axis back to the front, as in the input
    result = numpy.moveaxis(result, -1, 0)
    # discard the samples at both ends of the time axis
    result = result[filtersize:-filtersize]
    return result

%time corrected = correct_bleaching(SPIM_DATASET[:, 400:432, 80:112])

plot_its(corrected, ymax=1000)

CPU times: total: 844 ms
Wall time: 85.1 ms

import dask
import dask.array


def run_ipcf_blocked(
    image_timeseries,
    output=None,
    chunks=(32, 32),
    radius=6,
    npoints=32,
    nbins=32,
    filtersize=0,
    ipcf_function=ipcf_cython,
    correct_bleaching=correct_bleaching,
    num_workers=MAXCPUS,
    num_threads=2,
):
    """Run ipcf_function on small overlapping blocks of image timeseries.

    The image time series (time axis first) is split along the image axes
    into blocks of shape `chunks`. Each block is extended by `radius`
    pixels on all sides, corrected for photobleaching with
    `correct_bleaching(block, filtersize=..., num_threads=...)`, and
    analyzed with `ipcf_function(block, circle_coordinates=..., bins=...,
    num_threads=...)`. The blocks are processed by dask using
    `num_workers` workers.

    `output` selects what is done with the result:

    - None: the result is computed and returned as an array in memory.
    - 'shape': nothing is computed; the shape of the result is returned.
    - any other object: the result is stored into it, for example a numpy
      array or HDF5 dataset of matching shape, and the object is returned.

    """
    ntimes, height, width = image_timeseries.shape

    # truncate time axis to power of two
    # the bleaching correction removes filtersize samples at both ends
    ntimes = 2 ** int(math.log(ntimes - 2 * filtersize, 2))

    # calculate circle coordinates
    circle_coordinates = circle(radius, npoints)

    # calculate log-bins
    bins = logbins(ntimes // 2, nbins)
    # the number of unique bins may be smaller than requested
    nbins = bins.size

    # create a dask chunked array
    # every block spans the whole time axis
    blocks = dask.array.from_array(
        image_timeseries,
        chunks=(image_timeseries.shape[0], chunks[0], chunks[1]),
    )

    # correct bleaching on overlapping blocks
    # blocks at the image border are padded with the constant 1 along the
    # image axes; the overlap is not trimmed because the ipcf needs it
    corrected = blocks.map_overlap(
        correct_bleaching,
        depth=(0, radius, radius),
        boundary=(0, 1, 1),
        trim=False,
        dtype=blocks.dtype,
        filtersize=filtersize,
        num_threads=num_threads,
    )

    # ipcf on corrected, overlapping blocks
    # the time axis is replaced by the circle point and bin axes
    ipcf_result = corrected.map_blocks(
        ipcf_function,
        dtype=numpy.float32,
        chunks=(chunks[0], chunks[1], npoints, nbins),
        drop_axis=0,
        new_axis=[2, 3],
        circle_coordinates=circle_coordinates,
        bins=bins,
        num_threads=num_threads,
    )

    # execute the compute graph or determine shape of result
    # only compare strings: comparing an output array to 'shape' is
    # element-wise in recent numpy and has no single truth value
    if isinstance(output, str) and output == 'shape':
        output = ipcf_result.shape
    elif output is None:
        output = ipcf_result.compute(num_workers=num_workers)
    else:
        ipcf_result.store(output, num_workers=num_workers)

    return output

radius = 6

%time ipcf_result = run_ipcf_blocked(SIMULATION_DATA, radius=radius)

test_ipcf(ipcf_result[radius:-radius, radius:-radius])

CPU times: total: 4.66 s
Wall time: 813 ms

args = dict(
    radius=6,
    npoints=32,
    nbins=32,
    filtersize=1024,
    chunks=(64, 64),
    ipcf_function=ipcf_cython,
    num_workers=MAXCPUS,
    num_threads=2,
)

# remove previous results
try:
    del SPIM_IPCF_RESULT
except Exception:
    pass
if 'ipcf_result' in HDF5_FILE:
    del HDF5_FILE['ipcf_result']

# determine size of output array
shape = run_ipcf_blocked(SPIM_DATASET, 'shape')

# allocate a new dataset in the HDF5 file
output = HDF5_FILE.create_dataset(
    'ipcf_result', shape=shape, dtype=numpy.float32
)

# run the analysis with output to HDF5 dataset
%time SPIM_IPCF_RESULT = run_ipcf_blocked(SPIM_DATASET, output, **args)

CPU times: total: 5min 14s
Wall time: 27.3 s

plot_ipcf_sprites(SPIM_IPCF_RESULT)

plot_ipcf_images(SPIM_IPCF_RESULT, figsize=(10, 6), interpolation='lanczos')

cleanup_hdf(remove=False)

%matplotlib inline
%reload_ext Cython

import datetime
import math
import multiprocessing
import os
import shutil
import sys
from distutils import ccompiler

import notebook

print(sys.executable)
print('Python', sys.version, end='\n\n')

for module in (
    'IPython',
    'notebook',
    'ipywidgets',
    'widgetsnbextension',
    'numpy',
    'scipy',
    'matplotlib',
    'skimage',
    'numba',
    'cupy',
    'h5py',
    'Cython',
    'dask',
    'tifffile',
):
    try:
        __import__(module)
    except Exception:
        continue
    lib = sys.modules[module]
    print(module.lower(), getattr(lib, '__version__', 'Unknown'))

print('\nCompiler type:', ccompiler.new_compiler().compiler_type, end='\n\n')
print(multiprocessing.cpu_count(), 'CPU cores')

try:
    import psutil

    print(
        '{:.0f} GB main memory\n'.format(psutil.virtual_memory()[0] / 2**30)
    )
except ImportError:
    pass

try:
    import numba.cuda

    print(numba.cuda.gpus[0].name.decode('utf8'))
except Exception:
    pass

if shutil.which('nvcc'):
    print()
    !nvcc --version

print()
try:
    print('Duration:', datetime.datetime.now() - START_TIME)
    print()
except NameError:
    pass
print(datetime.datetime.now())

X:\Python311\python.exe
Python 3.11.7 (tags/v3.11.7:fa7a6f2, Dec  4 2023, 19:24:49) [MSC v.1937 64 bit (AMD64)]

ipython 8.18.1
notebook 7.0.6
ipywidgets 8.1.1
widgetsnbextension 4.0.9
numpy 1.26.2
scipy 1.11.3
matplotlib 3.8.2
skimage 0.22.0
numba 0.58.1
cupy 12.3.0
h5py 3.10.0
cython 3.0.6
dask 2023.12.0
tifffile 2023.9.26

Compiler type: msvc

28 CPU cores
96 GB main memory

Quadro RTX 5000

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:09:35_Pacific_Daylight_Time_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0

Duration: 0:04:34.902883

2023-12-08 09:25:03.567517

	1 thread / ms	8 threads / ms	speedup
correlate_numpy	21.02	30.06	5.6
correlate_cython	72.74	91.53	6.4
correlate_cython_omp	6.45	53.23	1.0
correlate_numba	108.23	849.19	1.0
correlate_numba_parallel	9.57	99.85	0.8
correlate_numba_cuda	2.68	inf	0.0
correlate_scipy	0.52	2.19	1.9
correlate_fft	0.22	1.43	1.2
correlate_cufft	0.80	inf	0.0
correlate_cython_fft2d	0.14	0.91	1.2

Pair Correlation Function Analysis of Fluorescence Fluctuations in Big Image Time Series using Python¶

Abstract¶

Requirements¶

Familiarity with¶

Minimum computer specifications¶

Python development environment¶

Tutorial source code and data files¶

Configure the runtime environment¶

The Challenge¶

Outline¶

1. Implement a fast cross-correlation function¶

2. Implement pair correlation function analysis of small image time series¶

3. Implement out-of-core pair correlation function analysis of big image time series¶

1. Implement a fast cross-correlation function¶

Definition of cross-correlation¶

Linear and circular cross-correlation¶

Properties of cross-correlation¶

Cross-correlation using pure Python¶

Plot auto-correlation¶

Interactively plot cross-correlation¶

Multi-threading¶

Cross-correlation using C¶

Python lists¶

Why Python?¶

Why do we consider Python for big data image processing and analysis?¶

Cross-correlation using Numpy¶

Numpy multi-threaded¶

Cross-correlation using Cython¶

Using Cython with OpenMP¶

Just-in-time compile Python code using Numba¶

Parallelized numba code¶

Just-in-time compile Python code to CUDA using Numba¶

Switching to frequency domain¶

Cross-correlation using Scipy's convolution function¶

Circular cross-correlation using FFT¶

Circular cross-correlation using CUDA FFT¶

Use Cython with a C FFT library¶

Compare implementations¶

2. Implement pair correlation function analysis of small image time series¶

Load and explore simulated images¶

Process cross-correlation functions for image fluorescence fluctuation analysis¶

Reference implementation of pair correlation image analysis¶

Plot results of image pair correlation function analysis¶

Optimize the ipcf function¶

A fast ipcf function using Cython, OpenMP, and fft2d¶

3. Implement out-of-core pair correlation function analysis of big image time series¶

Browse SPIM time series of images¶

Select image regions of interest¶

Save region from SPIM files as chunked HDF5 file¶

Correct for photobleaching¶

Out-of-core image analysis using Dask¶

Run pair correlation function analysis on SPIM image time series¶

Cleanup¶

Outlook¶

References¶

Image pair correlation function analysis¶

Scientific Computing in Python¶

System information¶

Optimize the `ipcf` function¶

A fast `ipcf` function using Cython, OpenMP, and fft2d¶