diff --git a/src/cow_http2_machine.erl b/src/cow_http2_machine.erl
index 808c6cf..b08538b 100644
--- a/src/cow_http2_machine.erl
+++ b/src/cow_http2_machine.erl
@@ -293,17 +293,38 @@ init_upgrade_stream(Method, State=#http2_machine{mode=server, remote_streamid=0,
 	| {error, {stream_error, cow_http2:streamid(), cow_http2:error(), atom()}, State}
 	| {error, {connection_error, cow_http2:error(), atom()}, State}
 	when State::http2_machine().
+
+%-define(HTTP2_MACHINE_DEBUG, 1).
+-ifdef(HTTP2_MACHINE_DEBUG).
+-define(LOG_FRAME(Frame, State),
+	begin
+		Frame2 = case Frame of
+			{data,_,_,_} -> setelement(4, Frame, {'BINARY-DATA', byte_size(element(4, Frame))});
+			{continuation,_,_,_} -> setelement(4, Frame, {'BINARY-DATA', byte_size(element(4, Frame))});
+			_ -> Frame
+		end,
+		io:format(user, "~p rcv: ~p~n", [State#http2_machine.mode, Frame2])
+	end
+).
+-else.
+-define(LOG_FRAME(Frame, State), _ = Frame).
+-endif.
+
 frame(Frame, State=#http2_machine{state=settings, preface_timer=TRef}) ->
+	?LOG_FRAME(Frame, State),
 	ok = case TRef of
 		undefined -> ok;
 		_ -> erlang:cancel_timer(TRef, [{async, true}, {info, false}])
 	end,
 	settings_frame(Frame, State#http2_machine{state=normal, preface_timer=undefined});
 frame(Frame, State=#http2_machine{state={continuation, _, _}}) ->
+	?LOG_FRAME(Frame, State),
 	maybe_discard_result(continuation_frame(Frame, State));
-frame(settings_ack, State=#http2_machine{state=normal}) ->
+frame(Frame = settings_ack, State=#http2_machine{state=normal}) ->
+	?LOG_FRAME(Frame, State),
 	settings_ack_frame(State);
 frame(Frame, State=#http2_machine{state=normal}) ->
+	?LOG_FRAME(Frame, State),
 	Result = case element(1, Frame) of
 		data -> data_frame(Frame, State);
 		headers -> headers_frame(Frame, State);
diff --git a/src/cow_qs.erl b/src/cow_qs.erl
index 2eb52b8..dff840a 100644
--- a/src/cow_qs.erl
+++ b/src/cow_qs.erl
@@ -13,6 +13,7 @@
 %% OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
 -module(cow_qs).
+-dialyzer(no_improper_lists).
 
 -export([parse_qs/1]).
 -export([qs/1]).
diff --git a/src/cow_uri.erl b/src/cow_uri.erl
index 790d7de..1864139 100644
--- a/src/cow_uri.erl
+++ b/src/cow_uri.erl
@@ -13,6 +13,7 @@
 %% OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
 -module(cow_uri).
+-dialyzer(no_improper_lists).
 
 -export([urldecode/1]).
 -export([urlencode/1]).
diff --git a/src/cow_ws.erl b/src/cow_ws.erl
index 27c7c87..47c9e89 100644
--- a/src/cow_ws.erl
+++ b/src/cow_ws.erl
@@ -72,6 +72,8 @@
 -type utf8_state() :: 0..8 | undefined.
 -export_type([utf8_state/0]).
 
+-compile({inline, [utf8_class/0]}).
+
 %% @doc Generate a key for the Websocket handshake request.
 
 -spec key() -> binary().
@@ -518,13 +520,17 @@ unmask(Data, MaskKey, 0) ->
 	mask(Data, MaskKey, <<>>);
 %% We unmask on the fly so we need to continue from the right mask byte.
 unmask(Data, MaskKey, UnmaskedLen) ->
-	Left = UnmaskedLen rem 4,
-	Right = 4 - Left,
-	MaskKey2 = (MaskKey bsl (Left * 8)) + (MaskKey bsr (Right * 8)),
+	Left = (UnmaskedLen rem 4) * 8,
+	Right = 32 - Left,
+	MaskKey2 = (MaskKey bsl Left) + (MaskKey bsr Right),
 	mask(Data, MaskKey2, <<>>).
 
-mask(<<>>, _, Unmasked) ->
-	Unmasked;
+mask(<< O1:32, O2:32, O3:32, O4:32, Rest/bits >>, MaskKey, Acc) ->
+	T1 = O1 bxor MaskKey,
+	T2 = O2 bxor MaskKey,
+	T3 = O3 bxor MaskKey,
+	T4 = O4 bxor MaskKey,
+	mask(Rest, MaskKey, << Acc/binary, T1:32, T2:32, T3:32, T4:32 >>);
 mask(<< O:32, Rest/bits >>, MaskKey, Acc) ->
 	T = O bxor MaskKey,
 	mask(Rest, MaskKey, << Acc/binary, T:32 >>);
@@ -539,7 +545,9 @@ mask(<< O:16 >>, MaskKey, Acc) ->
 mask(<< O:8 >>, MaskKey, Acc) ->
 	<< MaskKey2:8, _:24 >> = << MaskKey:32 >>,
 	T = O bxor MaskKey2,
-	<< Acc/binary, T:8 >>.
+	<< Acc/binary, T:8 >>;
+mask(<<>>, _, Unmasked) ->
+	Unmasked.
 
 inflate_frame(Data, Inflate, TakeOver, FragState, true)
 		when FragState =:= undefined; element(1, FragState) =:= fin ->
@@ -559,14 +567,14 @@ validate_payload(Payload, Rest, undefined, _, _, _, true) ->
 	{ok, Payload, undefined, Rest};
 %% Text frames and close control frames MUST have a payload that is valid UTF-8.
 validate_payload(Payload, Rest, Utf8State, _, Type, _, Eof) when Type =:= text; Type =:= close ->
-	case validate_utf8(Payload, Utf8State) of
+	case validate_text(Payload, Utf8State) of
 		1 -> {error, badencoding};
 		Utf8State2 when not Eof -> {more, Payload, Utf8State2};
 		0 when Eof -> {ok, Payload, 0, Rest};
 		_ -> {error, badencoding}
 	end;
 validate_payload(Payload, Rest, Utf8State, _, fragment, {Fin, text, _}, Eof) ->
-	case validate_utf8(Payload, Utf8State) of
+	case validate_text(Payload, Utf8State) of
 		1 -> {error, badencoding};
 		0 when Eof -> {ok, Payload, 0, Rest};
 		Utf8State2 when Eof, Fin =:= nofin -> {ok, Payload, Utf8State2, Rest};
@@ -581,36 +589,158 @@ validate_payload(Payload, Rest, Utf8State, _, _, _, true) ->
 %% Based on the Flexible and Economical UTF-8 Decoder algorithm by
 %% Bjoern Hoehrmann <bjoern@hoehrmann.de> (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/).
 %%
-%% The original algorithm has been unrolled into all combinations of values for C and State
-%% each with a clause. The common clauses were then grouped together.
+%% The original algorithm has been reworked to better adapt to
+%% the current Erlang VM (at the time of writing).
+%%
+%% We keep the character class table to quickly find which class
+%% a character is. The transition table was removed in favor of
+%% a separate Erlang function per state as that proved more
+%% efficient.
+%%
+%% We store the character class table in a tuple returned by
+%% an inline function.
+%%
+%% We handle ASCII characters specially because when ASCII
+%% characters are present we are highly likely to have mostly
+%% or only ASCII characters. We process them 4 at a time when
+%% possible.
+%%
+%% When a non-ASCII character is encountered, we switch to
+%% the UTF-8 decoder. When in the UTF-8 decoder we have to
+%% process characters one at a time. When we are in the UTF-8
+%% decoder we expect there to be additional UTF-8 characters
+%% so we check for them instead of reverting back to ASCII
+%% every time. This greatly speeds up decoding of Japanese
+%% and other non-ASCII text.
+%%
+%% Our UTF-8 decoder functions consist of looking up the
+%% character class of the current byte and then using a
+%% case clause to determine which state we are switching to.
+%%
+%% We order clauses based on the likelihood of the character class.
+%% Order is determined by the number of occurrences of the class in
+%% the table. The order (and number of occurrences) is as follow:
+%% 7 (32), 2 (30), 1 and 9 (16), 3 (14), 8 (13), 6 (3), 4, 5, 10 and 11.
 %%
 %% This function returns 0 on success, 1 on error, and 2..8 on incomplete data.
-validate_utf8(<<>>, State) -> State;
-validate_utf8(<< C, Rest/bits >>, 0) when C < 128 -> validate_utf8(Rest, 0);
-validate_utf8(<< C, Rest/bits >>, 2) when C >= 128, C < 144 -> validate_utf8(Rest, 0);
-validate_utf8(<< C, Rest/bits >>, 3) when C >= 128, C < 144 -> validate_utf8(Rest, 2);
-validate_utf8(<< C, Rest/bits >>, 5) when C >= 128, C < 144 -> validate_utf8(Rest, 2);
-validate_utf8(<< C, Rest/bits >>, 7) when C >= 128, C < 144 -> validate_utf8(Rest, 3);
-validate_utf8(<< C, Rest/bits >>, 8) when C >= 128, C < 144 -> validate_utf8(Rest, 3);
-validate_utf8(<< C, Rest/bits >>, 2) when C >= 144, C < 160 -> validate_utf8(Rest, 0);
-validate_utf8(<< C, Rest/bits >>, 3) when C >= 144, C < 160 -> validate_utf8(Rest, 2);
-validate_utf8(<< C, Rest/bits >>, 5) when C >= 144, C < 160 -> validate_utf8(Rest, 2);
-validate_utf8(<< C, Rest/bits >>, 6) when C >= 144, C < 160 -> validate_utf8(Rest, 3);
-validate_utf8(<< C, Rest/bits >>, 7) when C >= 144, C < 160 -> validate_utf8(Rest, 3);
-validate_utf8(<< C, Rest/bits >>, 2) when C >= 160, C < 192 -> validate_utf8(Rest, 0);
-validate_utf8(<< C, Rest/bits >>, 3) when C >= 160, C < 192 -> validate_utf8(Rest, 2);
-validate_utf8(<< C, Rest/bits >>, 4) when C >= 160, C < 192 -> validate_utf8(Rest, 2);
-validate_utf8(<< C, Rest/bits >>, 6) when C >= 160, C < 192 -> validate_utf8(Rest, 3);
-validate_utf8(<< C, Rest/bits >>, 7) when C >= 160, C < 192 -> validate_utf8(Rest, 3);
-validate_utf8(<< C, Rest/bits >>, 0) when C >= 194, C < 224 -> validate_utf8(Rest, 2);
-validate_utf8(<< 224, Rest/bits >>, 0) -> validate_utf8(Rest, 4);
-validate_utf8(<< C, Rest/bits >>, 0) when C >= 225, C < 237 -> validate_utf8(Rest, 3);
-validate_utf8(<< 237, Rest/bits >>, 0) -> validate_utf8(Rest, 5);
-validate_utf8(<< C, Rest/bits >>, 0) when C =:= 238; C =:= 239 -> validate_utf8(Rest, 3);
-validate_utf8(<< 240, Rest/bits >>, 0) -> validate_utf8(Rest, 6);
-validate_utf8(<< C, Rest/bits >>, 0) when C =:= 241; C =:= 242; C =:= 243 -> validate_utf8(Rest, 7);
-validate_utf8(<< 244, Rest/bits >>, 0) -> validate_utf8(Rest, 8);
-validate_utf8(_, _) -> 1.
+%% It expects a starting state value of 0. It can be called again
+%% to stream parse large amounts of text as long as the returned
+%% 2..8 state is provided when it is called back.
+
+validate_text(Text, 0) -> validate_ascii(Text);
+validate_text(Text, 2) -> validate_s2(Text);
+validate_text(Text, 3) -> validate_s3(Text);
+validate_text(Text, 4) -> validate_s4(Text);
+validate_text(Text, 5) -> validate_s5(Text);
+validate_text(Text, 6) -> validate_s6(Text);
+validate_text(Text, 7) -> validate_s7(Text);
+validate_text(Text, 8) -> validate_s8(Text).
+
+validate_ascii(<<>>) -> 0;
+validate_ascii(<<C1,C2,C3,C4,R/bits>>) when C1 < 128, C2 < 128, C3 < 128, C4 < 128 -> validate_ascii(R);
+validate_ascii(<<C1,R/bits>>) when C1 < 128 -> validate_ascii(R);
+validate_ascii(Text) -> validate_s0(Text).
+
+%% Instead of switching back to ASCII we first have this
+%% function attempt to find a non-ASCII character to
+%% greatly speed up decoding of Japanese and other languages.
+validate_s0(<<C,R/bits>>) when C >= 128 ->
+	Class = element(C - 127, utf8_class()),
+	case Class of
+		2 -> validate_s2(R);
+		3 -> validate_s3(R);
+		6 -> validate_s7(R);
+		4 -> validate_s5(R);
+		5 -> validate_s8(R);
+		10 -> validate_s4(R);
+		11 -> validate_s6(R);
+		_ -> 1
+	end;
+validate_s0(Text) ->
+	validate_ascii(Text).
+
+validate_s2(<<C,R/bits>>) ->
+	Class = element(C - 127, utf8_class()),
+	case Class of
+		7 -> validate_s0(R);
+		1 -> validate_s0(R);
+		9 -> validate_s0(R);
+		_ -> 1
+	end;
+validate_s2(<<>>) ->
+	2.
+
+validate_s3(<<C,R/bits>>) ->
+	Class = element(C - 127, utf8_class()),
+	case Class of
+		7 -> validate_s2(R);
+		1 -> validate_s2(R);
+		9 -> validate_s2(R);
+		_ -> 1
+	end;
+validate_s3(<<>>) ->
+	3.
+
+validate_s4(<<C,R/bits>>) ->
+	Class = element(C - 127, utf8_class()),
+	case Class of
+		7 -> validate_s2(R);
+		_ -> 1
+	end;
+validate_s4(<<>>) ->
+	4.
+
+validate_s5(<<C,R/bits>>) ->
+	Class = element(C - 127, utf8_class()),
+	case Class of
+		1 -> validate_s2(R);
+		9 -> validate_s2(R);
+		_ -> 1
+	end;
+validate_s5(<<>>) ->
+	5.
+
+validate_s6(<<C,R/bits>>) ->
+	Class = element(C - 127, utf8_class()),
+	case Class of
+		7 -> validate_s3(R);
+		9 -> validate_s3(R);
+		_ -> 1
+	end;
+validate_s6(<<>>) ->
+	6.
+
+validate_s7(<<C,R/bits>>) ->
+	Class = element(C - 127, utf8_class()),
+	case Class of
+		7 -> validate_s3(R);
+		1 -> validate_s3(R);
+		9 -> validate_s3(R);
+		_ -> 1
+	end;
+validate_s7(<<>>) ->
+	7.
+
+validate_s8(<<C,R/bits>>) ->
+	Class = element(C - 127, utf8_class()),
+	case Class of
+		1 -> validate_s3(R);
+		_ -> 1
+	end;
+validate_s8(<<>>) ->
+	8.
+
+utf8_class() ->
+	{
+		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+		8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,
+		11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8
+	}.
 
 %% @doc Return a frame tuple from parsed state and data.