From dda91a30dd495fd0bc5309ca2e88b2ddf609c990 Mon Sep 17 00:00:00 2001 From: shenleban tongying Date: Sun, 24 Nov 2024 01:29:09 -0500 Subject: [PATCH] refactor: merge some encoding and encoding names related code --- .clang-tidy | 2 + src/common/iconv.cc | 6 +- src/common/iconv.hh | 13 +---- src/common/text.cc | 126 ++++++++++++++++++++-------------------- src/common/text.hh | 28 ++++++--- src/dict/dsl.cc | 7 ++- src/dict/dsl_details.cc | 18 +++--- src/dict/dsl_details.hh | 4 +- src/dict/gls.cc | 12 ++-- src/dict/hunspell.cc | 6 +- src/dict/lsa.cc | 2 +- 11 files changed, 113 insertions(+), 111 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 1447cfea4..c60fecef0 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -26,6 +26,7 @@ Checks: > -google-readability-casting, -hicpp-deprecated-headers, -hicpp-no-array-decay, + -misc-confusable-identifiers, -misc-const-correctness, -misc-include-cleaner, -misc-non-private-member-variables-in-classes, @@ -33,6 +34,7 @@ Checks: > -modernize-deprecated-headers, -modernize-use-nodiscard, -modernize-use-trailing-return-type, + -performance-enum-size, -readability-function-cognitive-complexity, -readability-identifier-length, -readability-magic-numbers, diff --git a/src/common/iconv.cc b/src/common/iconv.cc index 6c700e9cf..bdb06c0a8 100644 --- a/src/common/iconv.cc +++ b/src/common/iconv.cc @@ -6,12 +6,8 @@ #include #include -char const * const Iconv::GdWchar = "UTF-32LE"; -char const * const Iconv::Utf16Le = "UTF-16LE"; -char const * const Iconv::Utf8 = "UTF-8"; - Iconv::Iconv( char const * from ): - state( iconv_open( Utf8, from ) ) + state( iconv_open( Text::utf8, from ) ) { if ( state == (iconv_t)-1 ) { throw exCantInit( strerror( errno ) ); diff --git a/src/common/iconv.hh b/src/common/iconv.hh index f27aaecdf..6a3cb3935 100644 --- a/src/common/iconv.hh +++ b/src/common/iconv.hh @@ -3,14 +3,11 @@ #pragma once -#include - -#include "text.hh" #include "ex.hh" - +#include "text.hh" +#include #include - /// 
"Internationalization conversion" for char encoding conversion, currently implemented with iconv() /// Only supports converting from a known "from" to UTF8 class Iconv @@ -22,12 +19,6 @@ public: DEF_EX( Ex, "Iconv exception", std::exception ) DEF_EX_STR( exCantInit, "Can't initialize iconv conversion:", Ex ) - // Some predefined character sets' names - - static char const * const GdWchar; - static char const * const Utf16Le; - static char const * const Utf8; - explicit Iconv( char const * from ); ~Iconv(); diff --git a/src/common/text.cc b/src/common/text.cc index 71c8038da..5f3901a7c 100644 --- a/src/common/text.cc +++ b/src/common/text.cc @@ -10,6 +10,60 @@ namespace Text { +const char * getEncodingNameFor( Encoding e ) +{ + switch ( e ) { + case Encoding::Utf32LE: + return utf32_le; + case Encoding::Utf32BE: + return utf32_be; + case Encoding::Utf32: + return utf32; + case Encoding::Utf16LE: + return utf16_le; + case Encoding::Utf16BE: + return utf16_be; + case Encoding::Windows1252: + return windows_1252; + case Encoding::Windows1251: + return windows_1251; + case Encoding::Windows1250: + return windows_1250; + case Encoding::Utf8: + default: + return utf8; + } +} + +Encoding getEncodingForName( const QByteArray & name ) +{ + auto const n = name.toUpper(); + if ( n == utf32_le ) { + return Encoding::Utf32LE; + } + if ( n == utf32_be ) { + return Encoding::Utf32BE; + } + if ( n == utf32 ) { + return Encoding::Utf32; + } + if ( n == utf16_le ) { + return Encoding::Utf16LE; + } + if ( n == utf16_be ) { + return Encoding::Utf16BE; + } + if ( n == windows_1252 ) { + return Encoding::Windows1252; + } + if ( n == windows_1251 ) { + return Encoding::Windows1251; + } + if ( n == windows_1250 ) { + return Encoding::Windows1250; + } + return Encoding::Utf8; +} /// Encodes the given UTF-32 into UTF-8. The inSize specifies the number /// of wide characters the 'in' pointer points to. 
The 'out' buffer must be @@ -200,87 +254,31 @@ int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2lengt return pos - s1 + s2length; } -char const * getEncodingNameFor( Encoding e ) -{ - switch ( e ) { - case Utf32LE: - return "UTF-32LE"; - case Utf32BE: - return "UTF-32BE"; - case Utf16LE: - return "UTF-16LE"; - case Utf16BE: - return "UTF-16BE"; - case Windows1252: - return "WINDOWS-1252"; - case Windows1251: - return "WINDOWS-1251"; - case Utf8: - return "UTF-8"; - case Windows1250: - return "WINDOWS-1250"; - default: - return "UTF-8"; - } -} - -Encoding getEncodingForName( const QByteArray & _name ) -{ - const auto name = _name.toUpper(); - if ( name == "UTF-32LE" ) { - return Utf32LE; - } - if ( name == "UTF-32BE" ) { - return Utf32BE; - } - if ( name == "UTF-16LE" ) { - return Utf16LE; - } - if ( name == "UTF-16BE" ) { - return Utf16BE; - } - if ( name == "WINDOWS-1252" ) { - return Windows1252; - } - if ( name == "WINDOWS-1251" ) { - return Windows1251; - } - if ( name == "UTF-8" ) { - return Utf8; - } - if ( name == "WINDOWS-1250" ) { - return Windows1250; - } - return Utf8; -} LineFeed initLineFeed( const Encoding e ) { LineFeed lf{}; switch ( e ) { - case Utf32LE: + case Encoding::Utf32LE: lf.lineFeed = new char[ 4 ]{ 0x0A, 0, 0, 0 }; lf.length = 4; break; - case Utf32BE: + case Encoding::Utf32BE: lf.lineFeed = new char[ 4 ]{ 0, 0, 0, 0x0A }; lf.length = 4; break; - case Utf16LE: + case Encoding::Utf16LE: lf.lineFeed = new char[ 2 ]{ 0x0A, 0 }; lf.length = 2; break; - case Utf16BE: + case Encoding::Utf16BE: lf.lineFeed = new char[ 2 ]{ 0, 0x0A }; lf.length = 2; break; - case Windows1252: - - case Windows1251: - - case Utf8: - - case Windows1250: + case Encoding::Windows1252: + case Encoding::Windows1251: + case Encoding::Windows1250: + case Encoding::Utf8: default: lf.length = 1; lf.lineFeed = new char[ 1 ]{ 0x0A }; diff --git a/src/common/text.hh b/src/common/text.hh index f3c47d812..782ab96f1 100644 --- a/src/common/text.hh +++ 
b/src/common/text.hh @@ -2,18 +2,18 @@ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #pragma once -#include +#include "ex.hh" #include #include -#include "ex.hh" /// Facilities to process Text, focusing on Unicode namespace Text { DEF_EX_STR( exCantDecode, "Can't decode the given string from Utf8:", std::exception ) -// Those are possible encodings for .dsl files -enum Encoding { - Utf16LE, +/// Encoding names. Ref -> IANA's encoding names https://www.iana.org/assignments/character-sets/character-sets.xhtml +/// Notice: The ordering must not be changed before Utf32LE. The current .dsl format index file depends on it. +enum class Encoding { + Utf16LE = 0, Utf16BE, Windows1252, Windows1251, @@ -21,9 +21,25 @@ enum Encoding { Utf8, Utf32BE, Utf32LE, + Utf32, }; +inline constexpr auto utf16_be = "UTF-16BE"; +inline constexpr auto utf16_le = "UTF-16LE"; +inline constexpr auto utf32 = "UTF-32"; +inline constexpr auto utf32_be = "UTF-32BE"; +inline constexpr auto utf32_le = "UTF-32LE"; +inline constexpr auto utf8 = "UTF-8"; +inline constexpr auto windows_1250 = "WINDOWS-1250"; +inline constexpr auto windows_1251 = "WINDOWS-1251"; +inline constexpr auto windows_1252 = "WINDOWS-1252"; + +const char * getEncodingNameFor( Encoding e ); +Encoding getEncodingForName( const QByteArray & name ); + +/// utf32 -> utf8 std::string toUtf8( std::u32string const & ) noexcept; +/// utf8 -> utf32 std::u32string toUtf32( std::string const & ); /// Since the standard isspace() is locale-specific, we need something @@ -33,8 +49,6 @@ bool isspace( int c ); //get the first line in string s1. 
-1 if not found int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length ); -char const * getEncodingNameFor( Encoding e ); -Encoding getEncodingForName( const QByteArray & name ); struct LineFeed { diff --git a/src/dict/dsl.cc b/src/dict/dsl.cc index ed8dfbe03..eb5dc9785 100644 --- a/src/dict/dsl.cc +++ b/src/dict/dsl.cc @@ -1144,8 +1144,9 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword, } else { try { - articleData = - Iconv::toWstring( getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ), articleBody, articleSize ); + articleData = Iconv::toWstring( getEncodingNameFor( static_cast< Encoding >( idxHeader.dslEncoding ) ), + articleBody, + articleSize ); free( articleBody ); // Strip DSL comments @@ -1789,7 +1790,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f idx.write( soundDictName.data(), soundDictName.size() ); } - idxHeader.dslEncoding = scanner.getEncoding(); + idxHeader.dslEncoding = static_cast< uint32_t >( scanner.getEncoding() ); IndexedWords indexedWords; diff --git a/src/dict/dsl_details.cc b/src/dict/dsl_details.cc index c4afbb1da..96a43e478 100644 --- a/src/dict/dsl_details.cc +++ b/src/dict/dsl_details.cc @@ -844,7 +844,7 @@ bool ArticleDom::atSignFirstInLine() /////////////// DslScanner DslScanner::DslScanner( string const & fileName ): - encoding( Text::Utf8 ), + encoding( Text::Encoding::Utf8 ), readBufferPtr( readBuffer ), readBufferLeft( 0 ), linesRead( 0 ) @@ -875,19 +875,19 @@ DslScanner::DslScanner( string const & fileName ): guessedEncoding.has_value() ) { switch ( guessedEncoding.value() ) { case QStringConverter::Utf8: - encoding = Text::Utf8; + encoding = Text::Encoding::Utf8; break; case QStringConverter::Utf16LE: - encoding = Text::Utf16LE; + encoding = Text::Encoding::Utf16LE; break; case QStringConverter::Utf16BE: - encoding = Text::Utf16BE; + encoding = Text::Encoding::Utf16BE; break; case QStringConverter::Utf32LE: - encoding = 
Text::Utf16LE; + encoding = Text::Encoding::Utf32LE; break; case QStringConverter::Utf32BE: - encoding = Text::Utf32BE; + encoding = Text::Encoding::Utf32BE; break; default: break; @@ -976,13 +976,13 @@ DslScanner::DslScanner( string const & fileName ): qWarning( "Warning: encoding was specified in a Unicode file, ignoring." ); } else if ( !arg.compare( U"Latin" ) ) { - encoding = Text::Windows1252; + encoding = Text::Encoding::Windows1252; } else if ( !arg.compare( U"Cyrillic" ) ) { - encoding = Text::Windows1251; + encoding = Text::Encoding::Windows1251; } else if ( !arg.compare( U"EasternEuropean" ) ) { - encoding = Text::Windows1250; + encoding = Text::Encoding::Windows1250; } else { gzclose( f ); diff --git a/src/dict/dsl_details.hh b/src/dict/dsl_details.hh index 151596611..61a5b6217 100644 --- a/src/dict/dsl_details.hh +++ b/src/dict/dsl_details.hh @@ -207,8 +207,8 @@ void stripComments( std::u32string &, bool & ); inline size_t DslScanner::distanceToBytes( size_t x ) const { switch ( encoding ) { - case Text::Utf16LE: - case Text::Utf16BE: + case Encoding::Utf16LE: + case Encoding::Utf16BE: return x * 2; default: return x; diff --git a/src/dict/gls.cc b/src/dict/gls.cc index 299ec99fa..c1ad48617 100644 --- a/src/dict/gls.cc +++ b/src/dict/gls.cc @@ -123,7 +123,7 @@ class GlsScanner }; GlsScanner::GlsScanner( string const & fileName ): - encoding( Text::Utf8 ), + encoding( Encoding::Utf8 ), readBufferPtr( readBuffer ), readBufferLeft( 0 ), linesRead( 0 ) @@ -149,10 +149,10 @@ GlsScanner::GlsScanner( string const & fileName ): // If the file begins with the dedicated Unicode marker, we just consume // it. 
If, on the other hand, it's not, we return the bytes back if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE ) { - encoding = Text::Utf16LE; + encoding = Encoding::Utf16LE; } else if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF ) { - encoding = Text::Utf16BE; + encoding = Encoding::Utf16BE; } else if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB ) { // Looks like Utf8, read one more byte @@ -161,14 +161,14 @@ GlsScanner::GlsScanner( string const & fileName ): gzclose( f ); throw exMalformedGlsFile( fileName ); } - encoding = Text::Utf8; + encoding = Encoding::Utf8; } else { if ( gzrewind( f ) ) { gzclose( f ); throw exCantOpen( fileName ); } - encoding = Text::Utf8; + encoding = Encoding::Utf8; } codec = QTextCodec::codecForName( Text::getEncodingNameFor( encoding ) ); @@ -1259,7 +1259,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f idx.write( (uint32_t)dictionaryName.size() ); idx.write( dictionaryName.data(), dictionaryName.size() ); - idxHeader.glsEncoding = scanner.getEncoding(); + idxHeader.glsEncoding = static_cast< uint32_t >( scanner.getEncoding() ); IndexedWords indexedWords; diff --git a/src/dict/hunspell.cc b/src/dict/hunspell.cc index 049170d60..00638d65d 100644 --- a/src/dict/hunspell.cc +++ b/src/dict/hunspell.cc @@ -207,7 +207,7 @@ void HunspellArticleRequest::run() QMutexLocker _( &hunspellMutex ); - string trimmedWord_utf8 = Iconv::toUtf8( Iconv::GdWchar, trimmedWord.data(), trimmedWord.size() ); + string trimmedWord_utf8 = Iconv::toUtf8( Text::utf32_le, trimmedWord.data(), trimmedWord.size() ); if ( hunspell.spell( trimmedWord_utf8 ) ) { // Good word -- no spelling suggestions then. 
@@ -361,7 +361,7 @@ QList< std::u32string > suggest( std::u32string & word, QMutex & hunspellMutex, try { QMutexLocker _( &hunspellMutex ); - auto suggestions = hunspell.analyze( Iconv::toUtf8( Iconv::GdWchar, word.data(), word.size() ) ); + auto suggestions = hunspell.analyze( Iconv::toUtf8( Text::utf32_le, word.data(), word.size() ) ); if ( !suggestions.empty() ) { // There were some suggestions made for us. Make an appropriate output. @@ -464,7 +464,7 @@ void HunspellPrefixMatchRequest::run() QMutexLocker _( &hunspellMutex ); - if ( hunspell.spell( Iconv::toUtf8( Iconv::GdWchar, trimmedWord.data(), trimmedWord.size() ) ) ) { + if ( hunspell.spell( Iconv::toUtf8( Text::utf32_le, trimmedWord.data(), trimmedWord.size() ) ) ) { // Known word -- add it to the result QMutexLocker _( &dataMutex ); diff --git a/src/dict/lsa.cc b/src/dict/lsa.cc index 1af16bdf5..00c858c08 100644 --- a/src/dict/lsa.cc +++ b/src/dict/lsa.cc @@ -143,7 +143,7 @@ Entry::Entry( File::Index & f ) // Read the size of the recording, in samples samplesLength = f.read< uint32_t >(); - name = Iconv::toUtf8( Iconv::Utf16Le, &filenameBuffer.front(), read * sizeof( uint16_t ) ); + name = Iconv::toUtf8( Text::utf16_le, &filenameBuffer.front(), read * sizeof( uint16_t ) ); } class LsaDictionary: public BtreeIndexing::BtreeDictionary