emil-e · P-Andersson · Mar 16, 2016 · emil-e · Mar 19, 2016 · emil-e
diff --git a/include/rapidcheck/detail/Unicode.h b/include/rapidcheck/detail/Unicode.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <string>
+
+#include "rapidcheck/detail/BitStream.h"
+
+namespace rc {
+namespace detail {
+
+/// Reads bits from `stream` and returns a single, non-zero Unicode
+/// code point stored in `T`. Lower values have a higher chance
+/// to appear than the higher ones: most results will be
+/// within the Basic Multilingual Plane, though
+/// any valid Unicode code point may be generated.
+template<typename T, typename RandomType>
+T generateCodePoint(rc::detail::BitStream<RandomType>& stream);
+
+/// Converts a code point into a string `T` containing the UTF-8
+/// encoding (one to four code units) of the passed code point.
+/// A code point above 0x10FFFF yields an empty `T`.
+template<typename T, typename Y>
+T makeCharacterUtf8(Y codepoint);
+
+} // namespace detail
+} // namespace rc
+
+
+#include "Unicode.hpp"
diff --git a/include/rapidcheck/detail/Unicode.hpp b/include/rapidcheck/detail/Unicode.hpp
@@ -0,0 +1,96 @@
+#pragma once
+
+namespace rc {
+namespace detail {
+
+/// Draws bits from `stream` and returns a single non-zero Unicode code point.
+///
+/// Up to three leading bits pick a UTF-8 length class (1 to 4 bytes): the
+/// class grows by one for every `true` bit and selection stops at the first
+/// `false` bit or once four bytes are reached. A value that fits in at most
+/// that many UTF-8 bytes is then drawn, so low code points dominate.
+///
+/// @tparam T           integer type holding the result; at least three bytes.
+/// @tparam RandomType  random source type wrapped by the bit stream.
+/// @param stream       bit stream to consume; it is advanced by this call.
+/// @return a code point in [1, 0x10FFFF] that is not a surrogate
+///         (0xD800-0xDFFF), assuming `next<T>(n)` yields an unsigned
+///         n-bit value.
+template<typename T, typename RandomType>
+T generateCodePoint(rc::detail::BitStream<RandomType>& stream)
+{
+	static_assert(sizeof(T) >= 3,
+		"Code points can only be stored in types at least three bytes large.");
+
+	// Note, this algorithm is designed to provide
+	// good values for UTF8 encoding but can be
+	// used to generate any Unicode character.
+	//
+	// `stream` has a dependent type, so member template calls need the
+	// `template` disambiguator to parse on conforming compilers.
+	int maxBytes = 1;
+	while (maxBytes < 4 && stream.template next<bool>())
+	{
+		maxBytes += 1;
+	}
+
+	int noBits;
+	switch (maxBytes)
+	{
+	case 1:
+		noBits = 7;
+		break;
+	case 2:
+		noBits = 11;
+		break;
+	case 3:
+		noBits = 16;
+		break;
+	default:
+		// Four bytes carry 21 bits, but the top bit is drawn separately:
+		// when it is set only 16 further bits are used, so the result
+		// cannot exceed 0x10FFFF.
+		if (stream.template next<bool>())
+		{
+			return static_cast<T>(0x100000 | stream.template next<T>(16));
+		}
+		noBits = 20;
+		break;
+	}
+
+	// Redraw on 0 (never returned) and on UTF-16 surrogates, which are not
+	// Unicode scalar values and have no valid UTF-8 encoding.
+	T codepoint = 0;
+	do
+	{
+		codepoint = stream.template next<T>(noBits);
+	} while (codepoint == 0 || (codepoint >= 0xD800 && codepoint <= 0xDFFF));
+	return codepoint;
+}
+
+/// Encodes `codepoint` as UTF-8 and returns the code units in a `T`.
+///
+/// @tparam T  string-like type constructible from a braced list of
+///            `T::value_type` elements (e.g. `std::string`).
+/// @tparam Y  integer type of the code point.
+/// @param codepoint  value to encode.
+/// @return one to four code units for values up to 0x10FFFF; an empty `T`
+///         for anything larger.
+/// NOTE(review): surrogates (0xD800-0xDFFF) are encoded like any other
+/// three-byte value; callers that need strictly valid UTF-8 must not pass them.
+template<typename T, typename Y>
+T makeCharacterUtf8(Y codepoint)
+{
+	// `T::value_type` is a dependent name, so `typename` is required.
+	using ValType = typename T::value_type;
+	if (codepoint <= 0x7F)
+	{
+		// 0xxxxxxx
+		return{ static_cast<ValType>(codepoint) };
+	}
+	else if (codepoint <= 0x7FF)
+	{
+		// 110xxxxx 10xxxxxx
+		return{
+			static_cast<ValType>(0xC0 | ((codepoint >> (6)) & 0x1F)),
+			static_cast<ValType>(0x80 | ((codepoint) & 0x3F))
+		};
+	}
+	else if (codepoint <= 0xFFFF)
+	{
+		// 1110xxxx 10xxxxxx 10xxxxxx
+		return{
+			static_cast<ValType>(0xE0 | ((codepoint >> (6 + 6)) & 0x0F)),
+			static_cast<ValType>(0x80 | ((codepoint >> (6)) & 0x3F)),
+			static_cast<ValType>(0x80 | ((codepoint) & 0x3F))
+		};
+	}
+	else if (codepoint <= 0x10FFFF)
+	{
+		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+		return{
+			static_cast<ValType>(0xF0 | ((codepoint >> (6 + 6 + 6)) & 0x07)),
+			static_cast<ValType>(0x80 | ((codepoint >> (6 + 6)) & 0x3F)),
+			static_cast<ValType>(0x80 | ((codepoint >> (6)) & 0x3F)),
+			static_cast<ValType>(0x80 | ((codepoint) & 0x3F))
+		};
+	}
+	// Beyond the Unicode range: nothing to encode.
+	return T();
+}
+
+} // namespace detail
+} // namespace rc
diff --git a/include/rapidcheck/gen/Text.h b/include/rapidcheck/gen/Text.h
@@ -10,13 +10,33 @@ namespace gen {
 template <typename T>
 Gen<T> character();
 
+/// Generator of Unicode code point values stored in the integer type `T`.
+/// It has a higher chance of generating lower value code points.
+template <typename T>
+Gen<T> unicodeCodepoint();
+
+/// Generator of a container of Unicode code point values.
+/// NOTE(review): the only backing implementation visible in Text.hpp is for
+/// `std::vector` - confirm before using another container type.
+template <typename Container>
+Gen<Container> unicodeCodepoints();
+
+/// Generator of single Unicode text characters, encoded in UTF-8.
+/// Each character is returned as a `String` of variable length, since one
+/// code point may take several code units.
+template <typename String>
+Gen<String> characterUtf8();
+
 /// Generator of strings. Essentially equivalent to
 /// `gen::container<String>(gen::character<typename String::value_type>())` but
 /// a lot faster. If you need to use a custom character generator, use
 /// `gen::container`.
 template <typename String>
 Gen<String> string();
 
+/// Generator of strings, as `gen::string<String>()`,
+/// but filled with UTF-8 encoded Unicode code points.
+template <typename String>
+Gen<String> stringUtf8();
+
+
 } // namespace gen
 } // namespace rc
 

diff --git a/include/rapidcheck/gen/Text.hpp b/include/rapidcheck/gen/Text.hpp
@@ -5,13 +5,18 @@
 #include "rapidcheck/detail/BitStream.h"
 #include "rapidcheck/gen/Container.h"
 
+#include <cstdint>
+
+#include "rapidcheck/detail/Unicode.h"
+
 namespace rc {
 namespace gen {
 namespace detail {
 
 template <typename String>
 class StringGen;
 
+/// Generator implementation for containers of Unicode code points.
+/// Only declared here; usable through specializations for concrete
+/// container types.
+template <typename Container>
+class ContainerCodepointGen;
+
 template <typename T, typename... Args>
 class StringGen<std::basic_string<T, Args...>> {
 public:
@@ -41,6 +46,32 @@ class StringGen<std::basic_string<T, Args...>> {
   }
 };
 
+/// Generates `std::vector`s of Unicode code points of type `T`.
+template <typename T, typename... Args>
+class ContainerCodepointGen<std::vector<T, Args...>> {
+public:
+  using Container = std::vector<T, Args...>;
+
+  /// Produces a vector of between 0 and `size` code points drawn from
+  /// `random`. Shrinking first removes chunks of elements and then shrinks
+  /// the individual code points.
+  Shrinkable<Container> operator()(const Random &random, int size) const {
+    auto bits = rc::detail::bitStreamOf(random);
+    const auto count = bits.next<std::size_t>() % (size + 1);
+
+    Container codepoints;
+    codepoints.reserve(count);
+    while (codepoints.size() < count) {
+      codepoints.push_back(rc::detail::generateCodePoint<T>(bits));
+    }
+
+    return shrinkable::shrinkRecur(
+        std::move(codepoints), [](const Container &current) {
+          return seq::concat(
+              shrink::removeChunks(current),
+              shrink::eachElement(current, &shrink::unicodeCodepoint<T>));
+        });
+  }
+};
+
+
+
 template <typename T, typename... Args>
 struct DefaultArbitrary<std::basic_string<T, Args...>> {
   static Gen<std::basic_string<T, Args...>> arbitrary() {
@@ -64,11 +95,52 @@ Gen<T> character() {
   };
 }
 
+
+/// Builds a generator that draws one code point from the random source and
+/// shrinks it with `shrink::unicodeCodepoint`. The size argument is ignored.
+template <typename T>
+Gen<T> unicodeCodepoint() {
+  return [](const Random &random, int /*size*/) {
+    auto bits = ::rc::detail::bitStreamOf(random);
+    T start = rc::detail::generateCodePoint<T>(bits);
+    return shrinkable::shrinkRecur(std::move(start),
+                                   &shrink::unicodeCodepoint<T>);
+  };
+}
+
+/// Wraps `detail::ContainerCodepointGen<Container>` in a `Gen`.
+template <typename Container>
+Gen<Container> unicodeCodepoints() {
+  return Gen<Container>(detail::ContainerCodepointGen<Container>());
+}
+
+/// Generates one Unicode character as its UTF-8 encoding in a `String`.
+///
+/// A 32-bit code point is generated (and shrunk) first and then mapped
+/// through `rc::detail::makeCharacterUtf8`, so each result holds the code
+/// units of exactly one code point.
+template <typename String>
+Gen<String> characterUtf8() {
+  // The code point type must be spelled out: it cannot be deduced from an
+  // argument-less call, and the lambda needs a concrete parameter type.
+  return map(unicodeCodepoint<std::uint32_t>(),
+             [](std::uint32_t codepoint) {
+               return rc::detail::makeCharacterUtf8<String>(codepoint);
+             });
+}
+
+
 template <typename String>
 Gen<String> string() {
   return detail::StringGen<String>();
 }
 
+/// Generates a `String` of UTF-8 encoded Unicode text.
+///
+/// A vector of 32-bit code points is generated (and shrunk) first; every
+/// code point is then encoded with `rc::detail::makeCharacterUtf8` and the
+/// encodings are concatenated in order.
+template <typename String>
+Gen<String> stringUtf8() {
+  return map(unicodeCodepoints<std::vector<std::uint32_t>>(),
+             [](const std::vector<std::uint32_t> &codepoints) {
+               String str;
+               for (const auto cp : codepoints) {
+                 str += rc::detail::makeCharacterUtf8<String>(cp);
+               }
+               // Return the local by name: wrapping it in std::move would
+               // disable copy elision.
+               return str;
+             });
+}
+
+
+
 } // namespace gen
 } // namespace rc
 

diff --git a/include/rapidcheck/shrink/Shrink.h b/include/rapidcheck/shrink/Shrink.h
@@ -2,6 +2,8 @@
 
 #include "rapidcheck/Seq.h"
 
+#include <cstdint>
+
 namespace rc {
 namespace shrink {
 
@@ -50,6 +52,10 @@ inline Seq<bool> boolean(bool value);
 template <typename T>
 Seq<T> character(T value);
 
+/// Shrinks a Unicode code point. A few simple ASCII characters are offered
+/// first, followed by non-zero values from `towards(value, 1)`; the sequence
+/// ends at the first candidate equal to `value`.
+template <typename T>
+Seq<T> unicodeCodepoint(T value);
+
 } // namespace shrink
 } // namespace rc
 

diff --git a/include/rapidcheck/shrink/Shrink.hpp b/include/rapidcheck/shrink/Shrink.hpp
@@ -198,5 +198,17 @@ Seq<T> character(T value) {
   return seq::takeWhile(std::move(shrinks), [=](T x) { return x != value; });
 }
 
+/// Shrinks a Unicode code point: simple ASCII characters are tried first,
+/// then the non-zero values produced by `towards(value, 1)`. The sequence is
+/// cut off at the first candidate that equals `value` itself.
+template <typename T>
+Seq<T> unicodeCodepoint(T value) {
+  auto lowercase = seq::fromContainer(std::vector<T>({'a', 'b', 'c'}));
+  auto otherAscii = seq::fromContainer(
+      std::vector<T>({'A', 'B', 'C', '1', '2', '3', ' ', '\n'}));
+  auto numeric = seq::filter(
+      towards(value, static_cast<T>(1)),
+      [](const T &candidate) { return candidate != static_cast<T>(0); });
+
+  auto candidates = seq::cast<T>(seq::concat(
+      std::move(lowercase), std::move(otherAscii), std::move(numeric)));
+
+  return seq::takeWhile(std::move(candidates),
+                        [value](T x) { return x != value; });
+}
 } // namespace shrink
 } // namespace rc
diff --git a/src/gen/Text.cpp b/src/gen/Text.cpp
@@ -1,5 +1,7 @@
 #include "rapidcheck/gen/Text.h"
 
+#include "rapidcheck/detail/Unicode.h"
+
 template rc::Gen<std::string> rc::gen::string<std::string>();
 template rc::Gen<std::wstring> rc::gen::string<std::wstring>();
 template struct rc::Arbitrary<std::string>;

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -43,6 +43,7 @@ add_executable(rapidcheck_tests
   detail/TestMetadataTests.cpp
   detail/TestParamsTests.cpp
   detail/TestingTests.cpp
+  detail/UnicodeTests.cpp
   detail/VariantTests.cpp
   fn/CommonTests.cpp
   gen/BuildTests.cpp