Skip to content

Commit

Permalink
Pruned intersection (#517)
Browse files Browse the repository at this point in the history
  • Loading branch information
danpovey authored Dec 18, 2020
1 parent 212cf60 commit aa878dc
Show file tree
Hide file tree
Showing 35 changed files with 1,524 additions and 378 deletions.
18 changes: 17 additions & 1 deletion k2/csrc/algorithms.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ void Renumbering::ComputeOld2New() {
num_new_elems_ = old2new_.Back();
K2_CHECK_GE(num_new_elems_, 0);
K2_CHECK_LE(num_new_elems_, keep_.Dim());
old2new_ = old2new_.Range(0, keep_.Dim());
}

namespace {
Expand Down Expand Up @@ -59,4 +58,21 @@ void Renumbering::ComputeNew2Old() {
new2old_ = new2old_.Range(0, num_new_elems_);
}

Renumbering::Renumbering(const Array1<char> &keep,
const Array1<int32_t> &old2new,
const Array1<int32_t> &new2old):
keep_(keep), old2new_(old2new),
num_new_elems_(new2old.Dim()),
new2old_(new2old) { }


Renumbering IdentityRenumbering(ContextPtr c, int32_t size) {
Array1<char> keep(c, size + 1); // uninitialized.
keep = keep.Arange(0, size);
Array1<int32_t> range = Arange(c, 0, size + 1);
return Renumbering(keep, range, range.Arange(0, size));
}



} // namespace k2
36 changes: 28 additions & 8 deletions k2/csrc/algorithms.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ class Renumbering {
Renumbering(Renumbering &&src) = default;
// move assignment
Renumbering &operator=(Renumbering &&) = default;
// copy assignment
Renumbering &operator=(const Renumbering &) = default;


/*
This constructor will allocate memory for `keep_` array with size
Expand Down Expand Up @@ -56,6 +59,16 @@ class Renumbering {
Init(c, num_old_elems, init_keep_with_zero);
}

/*
This constructor is not intended for use by users; it is used by
IdentityRenumbering(). Just sets members to the provided arrays and
num_new_elems_ to new2old.Dim().
*/
Renumbering(const Array1<char> &keep,
const Array1<int32_t> &old2new,
const Array1<int32_t> &new2old);


void Init(ContextPtr c, int32_t num_old_elems,
bool init_keep_with_zero = false) {
NVTX_RANGE(K2_FUNC);
Expand Down Expand Up @@ -100,17 +113,20 @@ class Renumbering {
/* Return a mapping from old index to new index. This is created on demand
(must only be called after the Keep() array has been populated).
@param [in] extra_element If true, will return the array of size
NumOldElems() + 1, which includes one more element;
otherwise it will return an array of size NumOldElems().
This array is just the exclusive sum of Keep().
It gives the mapping for indexes that are kept; element
i is kept if `Old2New()[i+1] > Old2New()[i]`.
@return Returns an array mapping the old indexes to the new indexes.
Its dimension is the number of old indexes (i.e. keep_.Dim()
or NumOldElems()). It is just the exclusive sum of Keep().
It gives the mapping for indexes that are kept; ignore the
non-kept elements of it.
Will be allocated with the same context as keep_.
*/
Array1<int32_t> &Old2New() {
Array1<int32_t> Old2New(bool extra_element = false) {
NVTX_RANGE(K2_FUNC);
if (!old2new_.IsValid()) ComputeOld2New();
return old2new_;
if (extra_element) return old2new_;
else return old2new_.Arange(0, old2new_.Dim() - 1);
}

private:
Expand All @@ -121,12 +137,16 @@ class Renumbering {
Array1<char> keep_; // array of elements to keep; dimension is the
// `num_old_elems` provided in the constructor but it
// was allocated with one extra element.
Array1<int32_t> old2new_;
Array1<int32_t> old2new_; // note: dimension is num-old-elems + 1.
int32_t num_new_elems_; // equals last element of old2new_; set when
// old2new_ is created.
Array1<int32_t> new2old_;
};

// returns a Renumbering object that is the identity map. Caution; its Keep()
// elements are not set up.
Renumbering IdentityRenumbering(ContextPtr c, int32_t size);

} // namespace k2

#endif // K2_CSRC_ALGORITHMS_H_
12 changes: 5 additions & 7 deletions k2/csrc/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ class Array1 {
@param [in] size Number of elements to include, 0 <= size <= Dim()-start
*/
Array1<T> Range(int32_t start, int32_t size) const {
NVTX_RANGE(K2_FUNC);
K2_CHECK_GE(start, 0);
K2_CHECK_LE(start, Dim());
K2_CHECK_GE(size, 0);
Expand All @@ -120,7 +119,6 @@ class Array1 {
start <= end <= Dim().
*/
Array1<T> Arange(int32_t start, int32_t end) const {
NVTX_RANGE(K2_FUNC);
K2_CHECK_GE(start, 0);
K2_CHECK_LE(start, dim_);
K2_CHECK_GE(end, start);
Expand Down Expand Up @@ -344,9 +342,9 @@ class Array1 {
Array1(const Array1 &) = default;
// move constructor
Array1(Array1 &&) = default;
// assignment operator
// assignment operator (shallow); see Assign() for assignment of elements.
Array1 &operator=(const Array1 &) = default;
// move assignment operator
// move assignment operator (shallow)
Array1 &operator=(Array1 &&) = default;

/*
Expand Down Expand Up @@ -523,7 +521,7 @@ class Array2 {
}

// return a row (indexing on the 0th axis)
Array1<T> operator[](int32_t i) {
Array1<T> Row(int32_t i) {
NVTX_RANGE(K2_FUNC);
K2_CHECK_GE(i, 0);
K2_CHECK_LT(i, dim0_);
Expand Down Expand Up @@ -567,9 +565,9 @@ class Array2 {
Array2(const Array2 &other) = default;
// move constructor
Array2(Array2 &&other) = default;
// assignment operator
// assignment operator (shallow); see Assign() for assignment of elements.
Array2 &operator=(const Array2 &other) = default;
// move assignment operator
// move assignment operator (shallow);
Array2 &operator=(Array2 &&other) = default;

/* stride on 1st axis is 1 (in elements). */
Expand Down
2 changes: 1 addition & 1 deletion k2/csrc/array_inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ std::ostream &operator<<(std::ostream &stream, const Array2<T> &array) {
Array2<T> array_cpu = array.To(GetCpuContext());
int32_t num_rows = array_cpu.Dim0();
for (int32_t i = 0; i < num_rows; ++i) {
stream << ToPrintable(array_cpu[i]);
stream << ToPrintable(array_cpu.Row(i));
if (i + 1 < num_rows) stream << '\n';
}
return stream << "\n]";
Expand Down
7 changes: 7 additions & 0 deletions k2/csrc/array_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -628,6 +628,13 @@ void Sort(Array1<T> *array, Array1<int32_t> *index_map = nullptr);
template <typename T>
void Assign(Array2<T> &src, Array2<T> *dest);

/*
Assign elements from `src` to `dest`; they must have the same Dim().
*/
template <typename S, typename T>
void Assign(Array1<S> &src, Array1<T> *dest);


/*
Merge an array of Array1<T> with a `merge_map` which indicates which items
to get from which positions (doesn't do any checking of the merge_map values!)
Expand Down
23 changes: 23 additions & 0 deletions k2/csrc/array_ops_inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,29 @@ void Assign(Array2<T> &src, Array2<T> *dest) {
}
}


template <typename S, typename T>
void Assign(Array1<S> &src, Array1<T> *dest) {
K2_CHECK_EQ(src.Dim(), dest->Dim());
int32_t dim = src.Dim();
if (std::is_same<S,T>::value) {
size_t num_bytes = dim * sizeof(S);
src.Context()->CopyDataTo(num_bytes, src.Data(), dest->Context(),
dest->Data());
} else {
if (!src.Context()->IsCompatible(*dest->Context())) {
Array1<S> src_new = src.To(dest->Context());
Assign(src_new, dest);
}
const S *src_data = src.Data();
T *dest_data = dest->Data();
K2_EVAL(src.Context(), dim, lambda_copy_data, (int32_t i) -> void {
dest_data[i] = src_data[i];
});
}
}


template <typename T>
Array1<T> MergeWithMap(const Array1<uint32_t> &merge_map, int32_t num_srcs,
const Array1<T> **src) {
Expand Down
12 changes: 7 additions & 5 deletions k2/csrc/array_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ void TestArray2() {

auto cpu_array = array.To(cpu);
auto cuda_array = array.To(GetCudaContext());
auto cpu_acc = cpu_array.Accessor();

ASSERT_EQ(cpu_array.ElemStride0(), cpu_array.Dim1());
ASSERT_EQ(cuda_array.ElemStride0(), cuda_array.Dim1());
Expand All @@ -285,8 +286,8 @@ void TestArray2() {
for (auto c = 0; c != kDim1; ++c) {
// WARNING: it's inefficient to access elements of Array2
// with operator [][]
EXPECT_EQ(cpu_array[r][c], k);
EXPECT_EQ(cuda_array[r][c], k);
EXPECT_EQ(cpu_acc(r, c), k);
EXPECT_EQ(cuda_array.Row(r)[c], k);
++k;
}

Expand Down Expand Up @@ -316,14 +317,15 @@ void TestArray2() {

auto cpu_array = array.To(cpu);
auto cuda_array = array.To(GetCudaContext());
auto cpu_acc = cpu_array.Accessor();

auto k = 0;
for (auto r = 0; r != kDim0; ++r)
for (auto c = 0; c != kDim1; ++c) {
// WARNING: it's inefficient to access elements of Array2
// with operator [][]
EXPECT_EQ(cpu_array[r][c], k);
EXPECT_EQ(cuda_array[r][c], k);
EXPECT_EQ(cpu_acc(r,c), k);
EXPECT_EQ(cuda_array.Row(r)[c], k);
++k;
}

Expand Down Expand Up @@ -361,7 +363,7 @@ void TestArray2() {
{
// test operator[]
for (int32_t i = 0; i < array.Dim0(); ++i) {
Array1<T> sub_array = array[i];
Array1<T> sub_array = array.Row(i);
const T *sub_array_data = sub_array.Data();
ASSERT_EQ(sub_array.Dim(), array.Dim1());
std::vector<T> sub_array_cpu_data(sub_array.Dim());
Expand Down
7 changes: 4 additions & 3 deletions k2/csrc/benchmark/benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,10 @@ void PrintEnvironemntInfo() {
os << kPrefix << "torch CUDA version: " << kTorchCudaVersion << "\n";
os << kPrefix << "NVTX enabled: " << kEnableNvtx << "\n";
os << kPrefix << "Debug disabled: " << internal::kDisableDebug << "\n";
os << kPrefix
<< "cuda device sync enabled: " << internal::EnableCudaDeviceSync()
<< "\n";
os << kPrefix << "cuda device sync enabled: "
<< internal::EnableCudaDeviceSync() << "\n";
os << kPrefix << "Checks disabled: " << internal::DisableChecks() << "\n";


// print it to stderr so that it can be redirected
std::cerr << os.str() << "\n";
Expand Down
46 changes: 46 additions & 0 deletions k2/csrc/context.cu
Original file line number Diff line number Diff line change
Expand Up @@ -128,4 +128,50 @@ void GetBlockSizesForLambda2(int32_t m, int32_t n, dim3 *block_dim,
}
}


void Semaphore::Signal(ContextPtr c) {
DeviceType device_type = c->GetDeviceType();
if (device_type_ == kUnk)
device_type_ = device_type;
else
K2_CHECK_EQ(device_type, device_type_)
<< "Semaphore must always be used with the same device type.";
if (device_type == kCuda) {
cudaEvent_t event;
cudaError_t e = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
K2_CHECK_CUDA_ERROR(e) << "Error creating event";
// Note: this stream is subject to being overridden by With(stream..), see
// class With.
cudaStream_t stream = c->GetCudaStream();
e = cudaEventRecord(event, stream);
K2_CHECK_CUDA_ERROR(e) << "Error recording event.";
std::lock_guard<std::mutex> lock(events_mutex_);
events_.push_back(event);
}
semaphore_.release();
}

void Semaphore::Wait(ContextPtr c) {
DeviceType device_type = c->GetDeviceType();
if (device_type_ == kUnk)
device_type_ = device_type;
else
K2_CHECK_EQ(device_type, device_type_)
<< "Semaphore must always be used with the same device type.";
semaphore_.acquire();
if (device_type == kCuda) {
cudaEvent_t event;
{
std::lock_guard<std::mutex> lock(events_mutex_);
K2_CHECK(!events_.empty()); // would be code bug.
event = events_.front();
events_.pop_front();
}
int flags = 0;
cudaError_t e = cudaStreamWaitEvent(c->GetCudaStream(), event,
flags);
K2_CHECK_CUDA_ERROR(e) << "Error waiting on event.";
}
}

} // namespace k2
37 changes: 36 additions & 1 deletion k2/csrc/context.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,17 @@

#include <algorithm>
#include <cassert>
#include <deque>
#include <map>
#include <memory>
#include <mutex>
#include <ostream>
#include <type_traits>
#include <vector>

#include "k2/csrc/log.h"
#include "k2/csrc/nvtx.h"
#include "k2/csrc/semaphore.h"

namespace k2 {

Expand Down Expand Up @@ -363,7 +366,10 @@ inline DeviceType DeviceOf(const T &t) {

// This is for use by ParallelRunner and Context. Users probably should not
// interact with this directly. The idea is that the Context object will call
// this to possibly override its default thread. The
// this to possibly override its default thread. The user would
// create a new stream by calling ParallelRunner's NewStream() method, and
// do `With w(stream);` which calls Push(stream), and later Pop(stream) when it
// goes out of scope.
class CudaStreamOverride {
public:
inline cudaStream_t OverrideStream(cudaStream_t stream) {
Expand Down Expand Up @@ -397,6 +403,35 @@ class With {
cudaStream_t stream_;
};


/*
Our class Semaphore is a slight extension of std::counting_semaphore that also
takes care of stream synchronization. The projected use-case is when two
threads (possibly with different CUDA streams, if we are using CUDA) have a
producer-consumer relationship, such that one is waiting for the other.
The projected use is:
- Construct semaphore
- Producing thread (maybe repeatedly) calls semaphore.Signal(ctx);
- Consuming thread (maybe repeatedly) calls semaphore.Wait(ctx);
*/
class Semaphore {
public:
Semaphore(): device_type_(kUnk), semaphore_(0) { }

void Signal(ContextPtr c);

void Wait(ContextPtr c);

private:
DeviceType device_type_; // makes sure it's always used with the same device
// type.
k2std::counting_semaphore semaphore_;
std::mutex events_mutex_;
std::deque<cudaEvent_t> events_;
};



/*
Class ParallelRunner allows you to invoke CUDA kernels in parallel.
It works for CUDA and CPU, but for CPU it currently just executes things
Expand Down
11 changes: 9 additions & 2 deletions k2/csrc/fsa_algo.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,15 @@ void AddEpsilonSelfLoops(FsaOrVec &src, FsaOrVec *dest,
Dim0() as b_fsas. Elements of it may be empty if the
composition was empty, either intrinsically or due to
failure of pruned search.
@param[out] arc_map_a Vector of
@param[out] arc_map_a Will be set to a vector with Dim() equal to
the number of arcs in `out`, whose elements contain
the corresponding arc_idx01 in a_fsas.
@param[out] arc_map_b Will be set to a vector with Dim() equal to
the number of arcs in `out`, whose elements contain
the corresponding arc-index in b_fsas; this arc-index
is defined as the offset into b_fsas.scores, which is
well defined if the shape is known because we require
it to be contiguous.
*/
void IntersectDensePruned(FsaVec &a_fsas, DenseFsaVec &b_fsas,
float search_beam, float output_beam,
Expand Down
Loading

0 comments on commit aa878dc

Please sign in to comment.