Skip to contents

Introduction

This vignette gives some indication of how kd build and query times vary with the length of the query, target input data. These are based on real world data for Drosophila neurons (see Costa et al 2016). Since the source data are large (>16000 neurons) we have prepared a smaller random sample.

Load packages

## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl.init' failed, running with 'rgl.useNULL = TRUE'.

This is how we prepared the sample data:

tf=tempfile(fileext = 'dpscanon.rds')
download.file("http://virtualflybrain.org/data/VFB/nblast/flycircuit/dpscanon.rds", destfile = tf)
dps=readRDS(tf)
set.seed(42)
dps500=sample(dps,500)
saveRDS(dps500, file='dps500.rds')
unlink(tf)

OK now we’re going to use that canned sample. It has been made available by a github release. This allows it to be downloaded directly (rather than bloating the git repository).

tf2=tempfile(fileext = 'dps500.rds')
download.file("https://github.com/jefferis/RANN2/releases/download/v0.12pre/dps500.rds", destfile = tf2)
dps500=readRDS(tf2)
unlink(tf2)

Here we define a utility function borrowed from the nat.nblast package:

neuron_pairs <- function (query, target, n = NA, ignoreSelf = TRUE) 
{
    if (!is.character(query)) 
        query = names(query)
    if (missing(target)) 
        target = query
    else if (!is.character(target)) 
        target = names(target)
    if (is.na(n)) {
        rval = expand.grid(query = query, target = target, stringsAsFactors = FALSE, 
            KEEP.OUT.ATTRS = FALSE)
        if (ignoreSelf) 
            rval <- rval[rval$target != rval$query, ]
        return(rval)
    }
    remove_self = ignoreSelf && any(query %in% target)
    q = sample(query, n, replace = TRUE)
    t = sapply(q, function(z) sample(if (remove_self) 
        setdiff(target, z)
    else target, 1))
    data.frame(query = q, target = t, stringsAsFactors = F)
}

Choosee

np=neuron_pairs(dps500, n = 500)
sl=sapply(dps500, function(x) nrow(x$points))
np$query.len=sl[np$query]
np$target.len=sl[np$target]

Let’s investigate the build time for the kdtree of those neurons

times=llply(np$target, function(n)
  microbenchmark(nn2(dps500[[n]]$points,matrix(0,ncol=3,nrow=1), k=1), times = 20))
np$target.time=sapply(times,function(x) median(x$time)/1e6)
library(ggplot2)
ggplot(np, aes(target.len,target.time)) +
  scale_x_log10() + 
  scale_y_log10('Tree build time /ms') +
  geom_point() + 
  geom_smooth()

l=lm(target.time~target.len, data=np)
l$coefficients[2]
##   target.len 
## 0.0003300401

So in conclusion, ~ 0.5 ms / 1000 points to build.

For queries, total time will include build time, + query time depending O(n) on number of query points and O(n log n) on number of target points.

times2=mlply(np, function(query, target, ...)
  microbenchmark(nn2(dps500[[target]]$points,dps500[[query]]$points, k=1), times = 20))
np$search.time=sapply(times2,function(x) median(x$time)/1e6)
with(np, plot3d(log2(query.len), log2(target.len), log2(search.time)))
set.seed(421)
dps20=sample(dps500, 20)
times3=llply(dps20, function(query, ...)
  microbenchmark(nn2(dps20[[1]]$points,query$points, k=1), times = 20))
df=data.frame(query=names(dps20),target= names(dps20)[1], stringsAsFactors = F)
df$search.time=sapply(times3,function(x) median(x$time)/1e6)
df$query.len=sl[df$query]
ggplot(data=df, aes(query.len, search.time)) +
  scale_y_continuous('Query time /ms') +
  geom_point() + 
  geom_smooth(method = 'lm')
## `geom_smooth()` using formula = 'y ~ x'

l=lm(search.time~query.len, data=df)

Slope is 9.2474049^{-4}.

np40=neuron_pairs(dps20[1:2],dps20)
times4=mlply(np40, function(target, query, ...)
  microbenchmark(nn2(dps20[[target]]$points, dps20[[query]]$points, k=1), times = 20))

np40$search.time=sapply(times4,function(x) median(x$time)/1e6)
np40$target.len=sl[np40$target]
np40$query.len=sl[np40$query]
ggplot(np40, aes(target.len, search.time)) +
  scale_y_continuous('Query time /ms') +
  geom_point() +
  geom_smooth(method = 'lm')
## `geom_smooth()` using formula = 'y ~ x'

l=lm(search.time~target.len, data=np40)

Slope is 3.963059^{-4}.