Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions cuslines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,14 @@ def _detect_backend():
pass
return None


BACKEND = _detect_backend()

if BACKEND == "metal":
from cuslines.metal import (
MetalBootDirectionGetter as BootDirectionGetter,
)
from cuslines.metal import (
MetalGPUTracker as GPUTracker,
MetalGPUTracker as Tracker,
)
from cuslines.metal import (
MetalProbDirectionGetter as ProbDirectionGetter,
Expand All @@ -49,7 +48,7 @@ def _detect_backend():
elif BACKEND == "cuda":
from cuslines.cuda_python import (
BootDirectionGetter,
GPUTracker,
GPUTracker as Tracker,
ProbDirectionGetter,
PttDirectionGetter,
)
Expand All @@ -64,18 +63,24 @@ def _detect_backend():
WebGPUPttDirectionGetter as PttDirectionGetter,
)
from cuslines.webgpu import (
WebGPUTracker as GPUTracker,
WebGPUTracker as Tracker,
)
else:
raise ImportError(
"No GPU backend available. Install either:\n"
" - CUDA: pip install 'cuslines[cu13]' (NVIDIA GPU)\n"
" - Metal: pip install 'cuslines[metal]' (Apple Silicon)\n"
" - WebGPU: pip install 'cuslines[webgpu]' (cross-platform)"
from cuslines.numba import (
CPUBootDirectionGetter as BootDirectionGetter,
)
from cuslines.numba import (
CPUProbDirectionGetter as ProbDirectionGetter,
)
from cuslines.numba import (
CPUPttDirectionGetter as PttDirectionGetter,
)
from cuslines.numba import (
CPUTracker as Tracker,
)

__all__ = [
"GPUTracker",
"Tracker",
"ProbDirectionGetter",
"PttDirectionGetter",
"BootDirectionGetter",
Expand Down
3 changes: 1 addition & 2 deletions cuslines/cuda_c/boot.cu
Original file line number Diff line number Diff line change
Expand Up @@ -582,8 +582,7 @@ __device__ int get_direction_boot_d(
const int ndir = peak_directions_d<BDIM_X,
BDIM_Y>(__h_sh, dirs,
sphere_vertices,
sphere_edges,
reinterpret_cast<int *>(__r_sh)); // reuse __r_sh as shInd in func which is large enough
sphere_edges);
if (NATTEMPTS == 1) { // init=True...
return ndir; // and dirs;
} else { // init=False...
Expand Down
10 changes: 4 additions & 6 deletions cuslines/cuda_c/generate_streamlines_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));

extern __shared__ REAL_T __sh[];
REAL_T *__pmf_data_sh = __sh + tidy*N32DIMT;
__shared__ REAL_T pmf_data_sh[BDIM_Y][DIMT];
REAL_T* __pmf_data_sh = pmf_data_sh[tidy];

// pmf = self.pmf_gen.get_pmf_c(&point[0], pmf)
__syncwarp(WMASK);
Expand Down Expand Up @@ -94,13 +94,11 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
__syncwarp(WMASK);

if (IS_START) {
int *__shInd = reinterpret_cast<int *>(__sh + BDIM_Y*N32DIMT) + tidy*N32DIMT;
return peak_directions_d<BDIM_X,
BDIM_Y>(__pmf_data_sh,
dirs,
sphere_vertices,
sphere_edges,
__shInd);
sphere_edges);
} else {
REAL_T __tmp;
#ifdef DEBUG
Expand Down Expand Up @@ -148,7 +146,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
dir.y*sphere_vertices[i].y+
dir.z*sphere_vertices[i].z;

if (FABS(dot) < cos_similarity) {
if (APPLY_ABS_IF_SYM(dot) < cos_similarity) {
__pmf_data_sh[i] = 0.0;
}
}
Expand Down
6 changes: 6 additions & 0 deletions cuslines/cuda_c/globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@

#endif

#if SPHERE_SYMM == 0
#define APPLY_ABS_IF_SYM(x) (x)
#else
#define APPLY_ABS_IF_SYM(x) FABS(x)
#endif

#define MIN(x,y) (((x)<(y))?(x):(y))
#define MAX(x,y) (((x)>(y))?(x):(y))
#define POW2(n) (1 << (n))
Expand Down
28 changes: 15 additions & 13 deletions cuslines/cuda_c/ptt.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,14 @@ __device__ float interp4_d(const float3 pos, const float* frame,
};
const int odf_idx = static_cast<int>(tex3D<float>(*sphere_vertices_lut, uvw.z, uvw.y, uvw.x));

const int grid_col = odf_idx & WIDTH_MASK;
const int grid_row = odf_idx >> LOG2_WIDTH;

const float x_query = (float)(grid_col * DIMX) + pos.x;
const float y_query = (float)(grid_row * DIMY) + pos.y;
return tex3D<float>(*pmf, x_query, y_query, pos.z);
const int tx = odf_idx & WIDTH_MASK;
const int ty = (odf_idx >> LOG2_X) & HEIGHT_MASK;
const int tz = (odf_idx >> (LOG2_X + LOG2_Y));

const float x_query = (float)(tx * DIMX) + pos.x;
const float y_query = (float)(ty * DIMY) + pos.y;
const float z_query = (float)(tz * DIMZ) + pos.z;
return tex3D<float>(*pmf, x_query, y_query, z_query);
}

__device__ void prepare_propagator_d(float k1, float k2, float arclength,
Expand Down Expand Up @@ -231,6 +233,8 @@ __device__ int get_direction_ptt_d(
}
}

__syncwarp(WMASK);

const float first_val = interp4_d(
pos, __frame_sh, pmf,
sphere_vertices_lut);
Expand Down Expand Up @@ -287,17 +291,14 @@ __device__ int get_direction_ptt_d(

// Move vert to face
for (int ii = tidx; ii < DISC_FACE_CNT; ii+=BDIM_X) {
bool all_verts_valid = 1;
for (int jj = 0; jj < 3; jj++) {
float vert_val = __vert_pdf_sh[DISC_FACE[ii*3 + jj]];
if (vert_val == 0) {
all_verts_valid = IS_INIT; // On init, even go with faces that are not fully supported
__face_cdf_sh[ii] = 0;
break;
}
__face_cdf_sh[ii] += vert_val;
}
if (!all_verts_valid) {
__face_cdf_sh[ii] = 0;
}
}
__syncwarp(WMASK);

Expand Down Expand Up @@ -456,15 +457,16 @@ __device__ bool init_frame_ptt_d(
}
} else {
if (tidx == 0) {
for (int ii = 0; ii < 9; ii++) {
for (int ii = 0; ii < 6; ii++) {
__frame[ii] = -__frame[ii];
}
}
__syncwarp(WMASK);
}
if (tidx == 0) {
for (int ii = 0; ii < 9; ii++) {
for (int ii = 0; ii < 6; ii++) {
__frame[9+ii] = -__frame[ii]; // save flipped frame for second run
// note that we only flip the tangent and normal, not the binormal
}
}
__syncwarp(WMASK);
Expand Down
22 changes: 11 additions & 11 deletions cuslines/cuda_c/ptt_init.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,33 +27,33 @@ __global__ void getNumStreamlinesPtt_k( const int nseed,
curandStatePhilox4_32_10_t st;
curand_init(RNG_SEED, gid, 0, &st);

extern __shared__ REAL_T __sh[];
REAL_T *__pmf_data_sh = __sh + tidy*N32DIMT;
__shared__ REAL_T pmf_data_sh[BDIM_Y][DIMT];
REAL_T* __pmf_data_sh = pmf_data_sh[tidy];

REAL3_T point = seeds[slid];

#pragma unroll
for (int i = tidx; i < DIMT; i += BDIM_X) {
const int grid_col = i & WIDTH_MASK;
const int grid_row = i >> LOG2_WIDTH;

const REAL_T x_query = (REAL_T)(grid_col * DIMX) + point.x;
const REAL_T y_query = (REAL_T)(grid_row * DIMY) + point.y;
__pmf_data_sh[i] = tex3D<REAL_T>(*pmf, x_query, y_query, point.z);
const int tx = i & WIDTH_MASK;
const int ty = (i >> LOG2_X) & HEIGHT_MASK;
const int tz = (i >> (LOG2_X + LOG2_Y));

const REAL_T x_query = (REAL_T)(tx * DIMX) + point.x;
const REAL_T y_query = (REAL_T)(ty * DIMY) + point.y;
const REAL_T z_query = (REAL_T)(tz * DIMZ) + point.z;
__pmf_data_sh[i] = tex3D<REAL_T>(*pmf, x_query, y_query, z_query);
if (__pmf_data_sh[i] < PMF_THRESHOLD_P) {
__pmf_data_sh[i] = 0.0;
}
}
__syncwarp(WMASK);

int *__shInd = reinterpret_cast<int *>(__sh + BDIM_Y*N32DIMT) + tidy*N32DIMT;
int ndir = peak_directions_d<
BDIM_X,
BDIM_Y>(__pmf_data_sh,
__shDir,
sphere_vertices,
sphere_edges,
__shInd);
sphere_edges);

if (tidx == 0) {
slineOutOff[slid] = ndir;
Expand Down
16 changes: 9 additions & 7 deletions cuslines/cuda_c/tracking_helpers.cu
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,18 @@ template<int BDIM_X,
__device__ int peak_directions_d(const REAL_T *__restrict__ odf,
REAL3_T *__restrict__ dirs,
const REAL3_T *__restrict__ sphere_vertices,
const int2 *__restrict__ sphere_edges,
int *__restrict__ __shInd) {
const int2 *__restrict__ sphere_edges) {

const int tidx = threadIdx.x;
const int tidy = threadIdx.y;

const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));

const unsigned int lmask = (1 << lid)-1;

// __shared__ int __shInd[BDIM_Y][SAMPLM_NR];
__shared__ int shInd[BDIM_Y][SAMPLM_NR];
int* __shInd = shInd[tidy];

#pragma unroll
for(int j = tidx; j < SAMPLM_NR; j += BDIM_X) {
Expand Down Expand Up @@ -206,9 +207,10 @@ __device__ int peak_directions_d(const REAL_T *__restrict__ odf,

int j = 0;
for(; j < k; j++) {
const REAL_T cos = FABS(abc.x*dirs[j].x+
abc.y*dirs[j].y+
abc.z*dirs[j].z);
const REAL_T cos = APPLY_ABS_IF_SYM(
abc.x*dirs[j].x+
abc.y*dirs[j].y+
abc.z*dirs[j].z);
if (cos > cos_similarity) {
break;
}
Expand All @@ -232,7 +234,7 @@ __device__ int peak_directions_d(const REAL_T *__restrict__ odf,
__syncwarp(WMASK);
*/
#else
const int indMax = max_d<BDIM_X, SAMPLM_NR>(__shInd[tidy], -1);
const int indMax = max_d<BDIM_X, SAMPLM_NR>(SAMPLM_NR, __shInd, -1);
if (indMax != -1) {
__ret = MAKE_REAL3(sphere_vertices[indMax][0],
sphere_vertices[indMax][1],
Expand Down
Loading
Loading