Skip to content

Commit ec9c8e0

Browse files
Merge pull request #20 from ax3l/release-1.0.2crp
Release 1.0.2crp
2 parents bbf1a38 + cdef072 commit ec9c8e0

File tree

7 files changed

+89
-22
lines changed

7 files changed

+89
-22
lines changed

.travis.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
language: cpp
2+
3+
compiler:
4+
- gcc
5+
6+
script:
7+
- mkdir build_tmp && cd build_tmp
8+
- cmake $TRAVIS_BUILD_DIR
9+
- make
10+
11+
before_script:
12+
- sudo apt-get update -qq
13+
- sudo apt-get install -qq build-essential
14+
- sudo apt-get install -qq gcc-4.4 g++-4.4
15+
- sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.4 60 --slave /usr/bin/g++ g++ /usr/bin/g++-4.4
16+
- gcc --version && g++ --version
17+
- sudo apt-get install -qq nvidia-common
18+
- sudo apt-get install -qq nvidia-current
19+
- sudo apt-get install -qq nvidia-cuda-toolkit nvidia-cuda-dev
20+
- sudo find /usr/ -name libcuda*.so

CHANGELOG.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
Change Log / Release Log for ScatterAlloc
2+
================================================================
3+
4+
1.0.2crp
5+
-------------
6+
**Date:** 2014-01-07
7+
8+
This is our first bug fix release.
9+
We closed all issues documented in
10+
[Milestone *Bug fixes*](https://github.com/ComputationalRadiationPhysics/scatteralloc/issues?milestone=1&state=closed)
11+
12+
### Changes to 1.0.1
13+
14+
**Features:**
15+
- added travis-ci.org support for compile tests #7
16+
17+
**Bug fixes:**
18+
- broken cmake/compile #1
19+
- g++ warnings #10
20+
- only N-1 access blocks used instead of N #2
21+
- 32bit bug: heaps larger than 4GB could not be allocated #12
22+
23+
**Misc:**
24+
See the full changes at
25+
https://github.com/ComputationalRadiationPhysics/scatteralloc/compare/1.0.1...1.0.2crp

CMakeLists.txt

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
project(ScatterAlloc)
22
cmake_minimum_required(VERSION 2.6)
3-
set(CUDA_NVCC_FLAGS "-arch=sm_20;-use_fast_math;")
3+
44
find_package(CUDA REQUIRED)
5+
set(CUDA_NVCC_FLAGS "-arch=sm_20;-use_fast_math;")
6+
set(CUDA_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR})
57
include_directories(${CUDA_INCLUDE_DIRS})
68
cuda_include_directories(${CUDA_INCLUDE_DIRS})
79

@@ -17,7 +19,4 @@ if(NOT ${CUDA_OPTIMIZATION_TYPE} STREQUAL "unset")
1719
endif()
1820

1921
cuda_add_executable(ScatterAllocExample
20-
example.cu
21-
${CMAKE_CURRENT_SOURCE_DIR}/tools/heap.cuh
22-
${CMAKE_CURRENT_SOURCE_DIR}/tools/heap_impl.cuh
23-
${CMAKE_CURRENT_SOURCE_DIR}/tools/utils.h)
22+
example.cu )

README.md

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,23 @@ ScatterAlloc
33

44
ScatterAlloc: Massively Parallel Dynamic Memory Allocation for the GPU
55

6+
This project provides a **fast memory manager** for **Nvidia GPUs** with
7+
compute capability `sm_20` or higher.
8+
9+
From http://www.icg.tugraz.at/project/mvp/downloads :
10+
```quote
11+
ScatterAlloc is a dynamic memory allocator for the GPU. It is
12+
designed concerning the requirements of massively parallel
13+
execution.
14+
15+
ScatterAlloc greatly reduces collisions and congestion by
16+
scattering memory requests based on hashing. It can deal with
17+
thousands of GPU-threads concurrently allocating memory and its
18+
execution time is almost independent of the thread count.
19+
20+
ScatterAlloc is open source and easy to use in your CUDA projects.
21+
```
22+
623
Original Homepage: http://www.icg.tugraz.at/project/mvp
724

825
Our Homepage: https://www.hzdr.de/crp
@@ -15,8 +32,8 @@ This repository is a
1532
[fork](https://en.wikipedia.org/wiki/Fork_%28software_development%29)
1633
of the **ScatterAlloc** project from the
1734
[Managed Volume Processing](http://www.icg.tugraz.at/project/mvp)
18-
group at [Institute for Computer Graphics](http://www.icg.tugraz.at) and
19-
Vision, TU Graz (kudos!).
35+
group at the [Institute for Computer Graphics and Vision](http://www.icg.tugraz.at),
36+
TU Graz (kudos!).
2037

2138
Our aim is to improve the implementation, add new features and fix some
2239
minor bugs.

example.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,4 +128,4 @@ void runexample(int cuda_device)
128128
CUDA_CHECKED_CALL(cudaDeviceSynchronize());
129129
freeSomething<<<grid,block>>>(data);
130130
CUDA_CHECKED_CALL(cudaDeviceSynchronize());
131-
}
131+
}

tools/heap.cuh

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@
44
55
Copyright (C) 2012 Institute for Computer Graphics and Vision,
66
Graz University of Technology
7+
Copyright (C) 2014 Institute of Radiation Physics,
8+
Helmholtz-Zentrum Dresden - Rossendorf
79
810
Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at
11+
Rene Widera - r.widera ( at ) hzdr.de
912
1013
Permission is hereby granted, free of charge, to any person obtaining a copy
1114
of this software and associated documentation files (the "Software"), to deal
@@ -29,6 +32,7 @@
2932
#ifndef HEAP_CUH
3033
#define HEAP_CUH
3134

35+
#include <stdio.h>
3236
#include "tools/utils.h"
3337

3438
namespace GPUTools
@@ -114,7 +118,7 @@ namespace GPUTools
114118
volatile uint* _regions;
115119
PAGE* _page;
116120
uint _numpages;
117-
uint _memsize;
121+
size_t _memsize;
118122
uint _pagebasedMutex;
119123
volatile uint _firstFreePageBased;
120124
volatile uint _firstfreeblock;
@@ -171,7 +175,6 @@ namespace GPUTools
171175
return -1;
172176
spot = nextspot(old, spot, spots);
173177
}
174-
return -1;
175178
}
176179

177180
/**
@@ -273,7 +276,7 @@ namespace GPUTools
273276
{
274277
for(uint b = startblock; b < accessblocks; ++b)
275278
{
276-
while(ptetry < b*pagesperblock)
279+
while(ptetry < (b+1)*pagesperblock)
277280
{
278281
uint region = ptetry/regionsize;
279282
uint regionfilllevel = _regions[region];
@@ -579,7 +582,7 @@ namespace GPUTools
579582
//take care of padding
580583
bytes = (bytes + dataAlignment - 1) & ~(dataAlignment-1);
581584

582-
bool use_coalescing = false;
585+
bool can_use_coalescing = false;
583586
uint myoffset = 0;
584587
uint warpid = GPUTools::warpid();
585588

@@ -592,15 +595,15 @@ namespace GPUTools
592595
if (coalescible && threadcount > 1)
593596
{
594597
myoffset = atomicAdd(&warp_sizecounter[warpid], bytes);
595-
use_coalescing = true;
598+
can_use_coalescing = true;
596599
}
597600

598601
uint req_size = bytes;
599-
if (use_coalescing)
602+
if (can_use_coalescing)
600603
req_size = (myoffset == 16) ? warp_sizecounter[warpid] : 0;
601604

602605
char* myalloc = (char*)alloc_internal_direct(req_size);
603-
if (req_size && use_coalescing)
606+
if (req_size && can_use_coalescing)
604607
{
605608
warp_res[warpid] = myalloc;
606609
if (myalloc != 0)
@@ -609,7 +612,7 @@ namespace GPUTools
609612
__threadfence_block();
610613

611614
void *myres = myalloc;
612-
if(use_coalescing)
615+
if(can_use_coalescing)
613616
{
614617
if(warp_res[warpid] != 0)
615618
myres = warp_res[warpid] + myoffset;
@@ -661,7 +664,7 @@ namespace GPUTools
661664
* @param memory pointer to the memory used for the heap
662665
* @param memsize size of the memory in bytes
663666
*/
664-
__device__ void init(void* memory, uint memsize)
667+
__device__ void init(void* memory, size_t memsize)
665668
{
666669
uint linid = threadIdx.x + blockDim.x*(threadIdx.y + threadIdx.z*blockDim.y);
667670
uint threads = blockDim.x*blockDim.y*blockDim.z;
@@ -711,7 +714,7 @@ namespace GPUTools
711714
_pagebasedMutex = 0;
712715
_firstFreePageBased = numpages-1;
713716

714-
if(_page[numpages].data - 1 >= (char*)(memory) + memsize)
717+
if( (char*) (_page+numpages) > (char*)(memory) + memsize)
715718
printf("error in heap alloc: numpages too high\n");
716719
}
717720

@@ -748,7 +751,7 @@ namespace GPUTools
748751
* global init heap method
749752
*/
750753
template<uint pagesize, uint accessblocks, uint regionsize, uint wastefactor, bool use_coalescing, bool resetfreedpages>
751-
__global__ void initHeap(DeviceHeap<pagesize, accessblocks, regionsize, wastefactor, use_coalescing, resetfreedpages>* heap, void* heapmem, uint memsize)
754+
__global__ void initHeap(DeviceHeap<pagesize, accessblocks, regionsize, wastefactor, use_coalescing, resetfreedpages>* heap, void* heapmem, size_t memsize)
752755
{
753756
heap->init(heapmem, memsize);
754757
}

tools/heap_impl.cuh

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@
44
55
Copyright (C) 2012 Institute for Computer Graphics and Vision,
66
Graz University of Technology
7+
Copyright (C) 2014 Institute of Radiation Physics,
8+
Helmholtz-Zentrum Dresden - Rossendorf
79
810
Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at
11+
Rene Widera - r.widera ( at ) hzdr.de
912
1013
Permission is hereby granted, free of charge, to any person obtaining a copy
1114
of this software and associated documentation files (the "Software"), to deal
@@ -54,11 +57,11 @@ void* initHeap(size_t memsize = 8*1024U*1024U)
5457
#ifdef __CUDACC__
5558
#ifdef OVERWRITE_MALLOC
5659
#if __CUDA_ARCH__ >= 200
57-
__device__ void* malloc(size_t t)
60+
__device__ void* malloc(size_t t) __THROW
5861
{
5962
return theHeap.alloc(t);
6063
}
61-
__device__ void free(void* p)
64+
__device__ void free(void* p) __THROW
6265
{
6366
theHeap.dealloc(p);
6467
}
@@ -96,4 +99,4 @@ __device__ void operator delete[](void* mem, GPUTools::DeviceHeap<pagesize, acce
9699

97100
#endif //__CUDACC__
98101

99-
#endif //HEAP_IMPL_CUH
102+
#endif //HEAP_IMPL_CUH

0 commit comments

Comments
 (0)