dipy · 36000 · May 7, 2026 · Apr 28, 2026 · Apr 28, 2026 · May 2, 2026
diff --git a/cuslines/__init__.py b/cuslines/__init__.py
@@ -30,15 +30,14 @@ def _detect_backend():
         pass
     return None
 
-
 BACKEND = _detect_backend()
 
 if BACKEND == "metal":
     from cuslines.metal import (
         MetalBootDirectionGetter as BootDirectionGetter,
     )
     from cuslines.metal import (
-        MetalGPUTracker as GPUTracker,
+        MetalGPUTracker as Tracker,
     )
     from cuslines.metal import (
         MetalProbDirectionGetter as ProbDirectionGetter,
@@ -49,7 +48,7 @@ def _detect_backend():
 elif BACKEND == "cuda":
     from cuslines.cuda_python import (
         BootDirectionGetter,
-        GPUTracker,
+        GPUTracker as Tracker,
         ProbDirectionGetter,
         PttDirectionGetter,
     )
@@ -64,18 +63,24 @@ def _detect_backend():
         WebGPUPttDirectionGetter as PttDirectionGetter,
     )
     from cuslines.webgpu import (
-        WebGPUTracker as GPUTracker,
+        WebGPUTracker as Tracker,
     )
 else:
-    raise ImportError(
-        "No GPU backend available. Install either:\n"
-        "  - CUDA: pip install 'cuslines[cu13]' (NVIDIA GPU)\n"
-        "  - Metal: pip install 'cuslines[metal]' (Apple Silicon)\n"
-        "  - WebGPU: pip install 'cuslines[webgpu]' (cross-platform)"
+    from cuslines.numba import (
+        CPUBootDirectionGetter as BootDirectionGetter,
+    )
+    from cuslines.numba import (
+        CPUProbDirectionGetter as ProbDirectionGetter,
+    )
+    from cuslines.numba import (
+        CPUPttDirectionGetter as PttDirectionGetter,
+    )
+    from cuslines.numba import (
+        CPUTracker as Tracker,
     )
 
 __all__ = [
-    "GPUTracker",
+    "Tracker",
     "ProbDirectionGetter",
     "PttDirectionGetter",
     "BootDirectionGetter",

diff --git a/cuslines/cuda_c/boot.cu b/cuslines/cuda_c/boot.cu
@@ -582,8 +582,7 @@ __device__ int get_direction_boot_d(
                 const int ndir = peak_directions_d<BDIM_X,
                                                    BDIM_Y>(__h_sh, dirs,
                                                            sphere_vertices,
-                                                           sphere_edges,
-							   reinterpret_cast<int *>(__r_sh)); // reuse __r_sh as shInd in func which is large enough
+                                                           sphere_edges);
                 if (NATTEMPTS == 1) { // init=True...
                         return ndir; // and dirs;
                 } else { // init=False...

diff --git a/cuslines/cuda_c/generate_streamlines_cuda.cu b/cuslines/cuda_c/generate_streamlines_cuda.cu
@@ -64,8 +64,8 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
         const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
 
-	extern __shared__ REAL_T __sh[];
-        REAL_T *__pmf_data_sh = __sh + tidy*N32DIMT;
+        __shared__ REAL_T pmf_data_sh[BDIM_Y][DIMT];
+        REAL_T* __pmf_data_sh = pmf_data_sh[tidy];
 
         // pmf = self.pmf_gen.get_pmf_c(&point[0], pmf)
         __syncwarp(WMASK);
@@ -94,13 +94,11 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
         __syncwarp(WMASK);
 
         if (IS_START) {
-                int *__shInd = reinterpret_cast<int *>(__sh + BDIM_Y*N32DIMT) + tidy*N32DIMT;
                 return peak_directions_d<BDIM_X,
                                          BDIM_Y>(__pmf_data_sh,
                                                  dirs,
                                                  sphere_vertices,
-                                                 sphere_edges,
-                                                 __shInd);
+                                                 sphere_edges);
         } else {
                 REAL_T __tmp;
                 #ifdef DEBUG
@@ -148,7 +146,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                                            dir.y*sphere_vertices[i].y+
                                            dir.z*sphere_vertices[i].z;
 
-                        if (FABS(dot) < cos_similarity) {
+                        if (APPLY_ABS_IF_SYM(dot) < cos_similarity) {
                                 __pmf_data_sh[i] = 0.0;
                         }
                 }

diff --git a/cuslines/cuda_c/globals.h b/cuslines/cuda_c/globals.h
@@ -67,6 +67,12 @@
 
 #endif
 
+#if SPHERE_SYMM == 0
+  #define APPLY_ABS_IF_SYM(x) (x)
+#else
+  #define APPLY_ABS_IF_SYM(x) FABS(x)
+#endif
+
 #define MIN(x,y) (((x)<(y))?(x):(y))
 #define MAX(x,y) (((x)>(y))?(x):(y))
 #define POW2(n) (1 << (n))

diff --git a/cuslines/cuda_c/ptt.cu b/cuslines/cuda_c/ptt.cu
@@ -31,12 +31,14 @@ __device__ float interp4_d(const float3 pos, const float* frame,
     };
     const int odf_idx = static_cast<int>(tex3D<float>(*sphere_vertices_lut, uvw.z, uvw.y, uvw.x));
 
-    const int grid_col = odf_idx & WIDTH_MASK;
-    const int grid_row = odf_idx >> LOG2_WIDTH;
-
-    const float x_query = (float)(grid_col * DIMX) + pos.x;
-    const float y_query = (float)(grid_row * DIMY) + pos.y;
-    return tex3D<float>(*pmf, x_query, y_query, pos.z);
+    const int tx = odf_idx & WIDTH_MASK;
+    const int ty = (odf_idx >> LOG2_X) & HEIGHT_MASK;
+    const int tz = (odf_idx >> (LOG2_X + LOG2_Y));
+
+    const float x_query = (float)(tx * DIMX) + pos.x;
+    const float y_query = (float)(ty * DIMY) + pos.y;
+    const float z_query = (float)(tz * DIMZ) + pos.z;
+    return tex3D<float>(*pmf, x_query, y_query, z_query);
 }
 
 __device__ void prepare_propagator_d(float k1, float k2, float arclength,
@@ -231,6 +233,8 @@ __device__ int get_direction_ptt_d(
         }
     }
 
+    __syncwarp(WMASK);
+
     const float first_val = interp4_d(
         pos, __frame_sh, pmf,
         sphere_vertices_lut);
@@ -287,17 +291,14 @@ __device__ int get_direction_ptt_d(
 
     // Move vert to face
     for (int ii = tidx; ii < DISC_FACE_CNT; ii+=BDIM_X) {
-        bool all_verts_valid = 1;
         for (int jj = 0; jj < 3; jj++) {
             float vert_val = __vert_pdf_sh[DISC_FACE[ii*3 + jj]];
             if (vert_val == 0) {
-                all_verts_valid = IS_INIT; // On init, even go with faces that are not fully supported
+                __face_cdf_sh[ii] = 0;
+                break;
             }
             __face_cdf_sh[ii] += vert_val;
         }
-        if (!all_verts_valid) {
-            __face_cdf_sh[ii] = 0;
-        }
     }
     __syncwarp(WMASK);
 
@@ -456,15 +457,16 @@ __device__ bool init_frame_ptt_d(
         }
     } else {
         if (tidx == 0) {
-            for (int ii = 0; ii < 9; ii++) {
+            for (int ii = 0; ii < 6; ii++) {
                 __frame[ii] = -__frame[ii];
             }
         }
         __syncwarp(WMASK);
     }
     if (tidx == 0) {
-        for (int ii = 0; ii < 9; ii++) {
+        for (int ii = 0; ii < 6; ii++) {
             __frame[9+ii] = -__frame[ii]; // save flipped frame for second run
+            // note that we only flip the tangent and normal, not the binormal
         }
     }
     __syncwarp(WMASK);

diff --git a/cuslines/cuda_c/ptt_init.cu b/cuslines/cuda_c/ptt_init.cu
@@ -27,33 +27,33 @@ __global__ void getNumStreamlinesPtt_k( const int nseed,
         curandStatePhilox4_32_10_t st;
         curand_init(RNG_SEED, gid, 0, &st);
 
-        extern __shared__ REAL_T __sh[];
-        REAL_T *__pmf_data_sh = __sh + tidy*N32DIMT;
+        __shared__ REAL_T pmf_data_sh[BDIM_Y][DIMT];
+        REAL_T* __pmf_data_sh = pmf_data_sh[tidy];
 
         REAL3_T point = seeds[slid];
 
         #pragma unroll
         for (int i = tidx; i < DIMT; i += BDIM_X) {
-                const int grid_col = i & WIDTH_MASK;
-                const int grid_row = i >> LOG2_WIDTH;
-
-                const REAL_T x_query = (REAL_T)(grid_col * DIMX) + point.x;
-                const REAL_T y_query = (REAL_T)(grid_row * DIMY) + point.y;
-                __pmf_data_sh[i] = tex3D<REAL_T>(*pmf, x_query, y_query, point.z);
+                const int tx = i & WIDTH_MASK;
+                const int ty = (i >> LOG2_X) & HEIGHT_MASK;
+                const int tz = (i >> (LOG2_X + LOG2_Y));
+
+                const REAL_T x_query = (REAL_T)(tx * DIMX) + point.x;
+                const REAL_T y_query = (REAL_T)(ty * DIMY) + point.y;
+                const REAL_T z_query = (REAL_T)(tz * DIMZ) + point.z;
+                __pmf_data_sh[i] = tex3D<REAL_T>(*pmf, x_query, y_query, z_query);
                 if (__pmf_data_sh[i] < PMF_THRESHOLD_P) {
                         __pmf_data_sh[i] = 0.0;
                 }
         }
         __syncwarp(WMASK);
 
-        int *__shInd = reinterpret_cast<int *>(__sh + BDIM_Y*N32DIMT) + tidy*N32DIMT;
         int ndir = peak_directions_d<
             BDIM_X,
             BDIM_Y>(__pmf_data_sh,
                     __shDir,
                     sphere_vertices,
-                    sphere_edges,
-                    __shInd);
+                    sphere_edges);
 
         if (tidx == 0) {
                 slineOutOff[slid] = ndir;

diff --git a/cuslines/cuda_c/tracking_helpers.cu b/cuslines/cuda_c/tracking_helpers.cu
@@ -79,17 +79,18 @@ template<int BDIM_X,
 __device__ int peak_directions_d(const REAL_T  *__restrict__ odf,
                                        REAL3_T *__restrict__ dirs,
                                  const REAL3_T *__restrict__ sphere_vertices,
-                                 const int2 *__restrict__ sphere_edges,
-				 int *__restrict__ __shInd) {
+                                 const int2 *__restrict__ sphere_edges) {
 
         const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
 
         const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
 
         const unsigned int lmask = (1 << lid)-1;
 
-//        __shared__ int __shInd[BDIM_Y][SAMPLM_NR];
+       __shared__ int shInd[BDIM_Y][SAMPLM_NR];
+       int* __shInd = shInd[tidy];
 
         #pragma unroll
         for(int j = tidx; j < SAMPLM_NR; j += BDIM_X) {
@@ -206,9 +207,10 @@ __device__ int peak_directions_d(const REAL_T  *__restrict__ odf,
 
                                 int j = 0;
                                 for(; j < k; j++) {
-                                        const REAL_T cos = FABS(abc.x*dirs[j].x+
-                                                                abc.y*dirs[j].y+
-                                                                abc.z*dirs[j].z);
+                                        const REAL_T cos = APPLY_ABS_IF_SYM(
+                                                abc.x*dirs[j].x+
+                                                abc.y*dirs[j].y+
+                                                abc.z*dirs[j].z);
                                         if (cos > cos_similarity) {
                                                 break;
                                         }
@@ -232,7 +234,7 @@ __device__ int peak_directions_d(const REAL_T  *__restrict__ odf,
         __syncwarp(WMASK);
 */
 #else
-        const int indMax = max_d<BDIM_X, SAMPLM_NR>(__shInd[tidy], -1);
+        const int indMax = max_d<BDIM_X, SAMPLM_NR>(SAMPLM_NR, __shInd, -1);
         if (indMax != -1) {
                 __ret = MAKE_REAL3(sphere_vertices[indMax][0],
                                    sphere_vertices[indMax][1],