Skip to content

Commit

Permalink
refactor: merge some encoding and encoding names related code
Browse files Browse the repository at this point in the history
  • Loading branch information
shenlebantongying authored Nov 24, 2024
1 parent 52a9427 commit dda91a3
Show file tree
Hide file tree
Showing 11 changed files with 113 additions and 111 deletions.
2 changes: 2 additions & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,15 @@ Checks: >
-google-readability-casting,
-hicpp-deprecated-headers,
-hicpp-no-array-decay,
-misc-confusable-identifiers,
-misc-const-correctness,
-misc-include-cleaner,
-misc-non-private-member-variables-in-classes,
-modernize-avoid-c-arrays,
-modernize-deprecated-headers,
-modernize-use-nodiscard,
-modernize-use-trailing-return-type,
-performance-enum-size,
-readability-function-cognitive-complexity,
-readability-identifier-length,
-readability-magic-numbers,
Expand Down
6 changes: 1 addition & 5 deletions src/common/iconv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,8 @@
#include <errno.h>
#include <string.h>

char const * const Iconv::GdWchar = "UTF-32LE";
char const * const Iconv::Utf16Le = "UTF-16LE";
char const * const Iconv::Utf8 = "UTF-8";

Iconv::Iconv( char const * from ):
state( iconv_open( Utf8, from ) )
state( iconv_open( Text::utf8, from ) )
{
if ( state == (iconv_t)-1 ) {
throw exCantInit( strerror( errno ) );
Expand Down
13 changes: 2 additions & 11 deletions src/common/iconv.hh
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,11 @@

#pragma once

#include <QString>

#include "text.hh"
#include "ex.hh"

#include "text.hh"
#include <QString>
#include <iconv.h>


/// "Internationalization conversion" for char encoding conversion, currently implemented with iconv()
/// Only supports converting from a known "from" to UTF8
class Iconv
Expand All @@ -22,12 +19,6 @@ public:
DEF_EX( Ex, "Iconv exception", std::exception )
DEF_EX_STR( exCantInit, "Can't initialize iconv conversion:", Ex )

// Some predefined character sets' names

static char const * const GdWchar;
static char const * const Utf16Le;
static char const * const Utf8;

explicit Iconv( char const * from );

~Iconv();
Expand Down
126 changes: 62 additions & 64 deletions src/common/text.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,60 @@

namespace Text {

/// Maps an Encoding value to its canonical encoding name.
/// Encoding::Utf8, and any value that has no dedicated case, yields the UTF-8 name.
const char * getEncodingNameFor( Encoding e )
{
  // UTF-8 doubles as the fallback, so only the remaining encodings need a case.
  const char * label = utf8;

  switch ( e ) {
    case Encoding::Utf16LE:
      label = utf16_le;
      break;
    case Encoding::Utf16BE:
      label = utf16_be;
      break;
    case Encoding::Utf32LE:
      label = utf32_le;
      break;
    case Encoding::Utf32BE:
      label = utf32_be;
      break;
    case Encoding::Utf32:
      label = utf32;
      break;
    case Encoding::Windows1250:
      label = windows_1250;
      break;
    case Encoding::Windows1251:
      label = windows_1251;
      break;
    case Encoding::Windows1252:
      label = windows_1252;
      break;
    default:
      break;
  }

  return label;
}

/// Resolves an encoding name to its Encoding value.
/// The name is upper-cased before comparison, so matching is case-insensitive.
/// Names that match none of the known encodings resolve to Encoding::Utf8.
Encoding getEncodingForName( const QByteArray & name )
{
  struct KnownName
  {
    const char * label;
    Encoding value;
  };

  // UTF-8 is intentionally absent: it is the fallback returned below.
  static constexpr KnownName knownNames[] = {
    { utf32_le, Encoding::Utf32LE },
    { utf32_be, Encoding::Utf32BE },
    { utf32, Encoding::Utf32 },
    { utf16_le, Encoding::Utf16LE },
    { utf16_be, Encoding::Utf16BE },
    { windows_1252, Encoding::Windows1252 },
    { windows_1251, Encoding::Windows1251 },
    { windows_1250, Encoding::Windows1250 },
  };

  const QByteArray upper = name.toUpper();
  for ( const auto & candidate : knownNames ) {
    if ( upper == candidate.label ) {
      return candidate.value;
    }
  }
  return Encoding::Utf8;
}

/// Encodes the given UTF-32 into UTF-8. The inSize specifies the number
/// of wide characters the 'in' pointer points to. The 'out' buffer must be
Expand Down Expand Up @@ -200,87 +254,31 @@ int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2lengt
return pos - s1 + s2length;
}

/// Maps an Encoding value to its encoding name.
/// Utf8, and any value that has no dedicated case, yields "UTF-8".
char const * getEncodingNameFor( Encoding e )
{
  // "UTF-8" doubles as the fallback, so only the remaining encodings need a case.
  char const * label = "UTF-8";

  switch ( e ) {
    case Utf16LE:
      label = "UTF-16LE";
      break;
    case Utf16BE:
      label = "UTF-16BE";
      break;
    case Utf32LE:
      label = "UTF-32LE";
      break;
    case Utf32BE:
      label = "UTF-32BE";
      break;
    case Windows1250:
      label = "WINDOWS-1250";
      break;
    case Windows1251:
      label = "WINDOWS-1251";
      break;
    case Windows1252:
      label = "WINDOWS-1252";
      break;
    default:
      break;
  }

  return label;
}

/// Resolves an encoding name to its Encoding value.
/// The name is upper-cased before comparison, so matching is case-insensitive.
/// Names that match none of the known encodings resolve to Utf8.
Encoding getEncodingForName( const QByteArray & _name )
{
  struct KnownName
  {
    char const * label;
    Encoding value;
  };

  static const KnownName knownNames[] = {
    { "UTF-32LE", Utf32LE },
    { "UTF-32BE", Utf32BE },
    { "UTF-16LE", Utf16LE },
    { "UTF-16BE", Utf16BE },
    { "WINDOWS-1252", Windows1252 },
    { "WINDOWS-1251", Windows1251 },
    { "UTF-8", Utf8 },
    { "WINDOWS-1250", Windows1250 },
  };

  const QByteArray upper = _name.toUpper();
  for ( const auto & candidate : knownNames ) {
    if ( upper == candidate.label ) {
      return candidate.value;
    }
  }
  return Utf8;
}

LineFeed initLineFeed( const Encoding e )
{
LineFeed lf{};
switch ( e ) {
case Utf32LE:
case Encoding::Utf32LE:
lf.lineFeed = new char[ 4 ]{ 0x0A, 0, 0, 0 };
lf.length = 4;
break;
case Utf32BE:
case Encoding::Utf32BE:
lf.lineFeed = new char[ 4 ]{ 0, 0, 0, 0x0A };
lf.length = 4;
break;
case Utf16LE:
case Encoding::Utf16LE:
lf.lineFeed = new char[ 2 ]{ 0x0A, 0 };
lf.length = 2;
break;
case Utf16BE:
case Encoding::Utf16BE:
lf.lineFeed = new char[ 2 ]{ 0, 0x0A };
lf.length = 2;
break;
case Windows1252:

case Windows1251:

case Utf8:

case Windows1250:
case Encoding::Windows1252:
case Encoding::Windows1251:
case Encoding::Windows1250:
case Encoding::Utf8:
default:
lf.length = 1;
lf.lineFeed = new char[ 1 ]{ 0x0A };
Expand Down
28 changes: 21 additions & 7 deletions src/common/text.hh
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,44 @@
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#pragma once

#include <cstdio>
#include "ex.hh"
#include <QByteArray>
#include <string>
#include "ex.hh"

/// Facilities to process Text, focusing on Unicode
namespace Text {
DEF_EX_STR( exCantDecode, "Can't decode the given string from Utf8:", std::exception )

// Those are possible encodings for .dsl files
enum Encoding {
Utf16LE,
/// Text encodings known to the application, named after IANA's character-set registry:
/// https://www.iana.org/assignments/character-sets/character-sets.xhtml
/// Notice: the enumerators before Utf32LE must keep their order (and thus their values);
/// the current .dsl format index file depends on it. The values are spelled out to make that explicit.
enum class Encoding {
  Utf16LE     = 0,
  Utf16BE     = 1,
  Windows1252 = 2,
  Windows1251 = 3,
  Windows1250 = 4,
  Utf8        = 5,
  Utf32BE     = 6,
  Utf32LE     = 7,
  Utf32       = 8,
};

/// Encoding names, one constant per supported encoding.
/// They are kept in upper case because getEncodingForName() upper-cases its input before comparing.
inline constexpr const char * utf8         = "UTF-8";
inline constexpr const char * utf16_le     = "UTF-16LE";
inline constexpr const char * utf16_be     = "UTF-16BE";
inline constexpr const char * utf32        = "UTF-32";
inline constexpr const char * utf32_le     = "UTF-32LE";
inline constexpr const char * utf32_be     = "UTF-32BE";
inline constexpr const char * windows_1250 = "WINDOWS-1250";
inline constexpr const char * windows_1251 = "WINDOWS-1251";
inline constexpr const char * windows_1252 = "WINDOWS-1252";

/// Returns the name of encoding `e`; values without a dedicated name yield the UTF-8 name.
const char * getEncodingNameFor( Encoding e );
/// Returns the encoding whose name equals `name` (the name is upper-cased before comparing);
/// unrecognized names yield Encoding::Utf8.
Encoding getEncodingForName( const QByteArray & name );

/// utf32 -> utf8
std::string toUtf8( std::u32string const & ) noexcept;
/// utf8 -> utf32
std::u32string toUtf32( std::string const & );

/// Since the standard isspace() is locale-specific, we need something
Expand All @@ -33,8 +49,6 @@ bool isspace( int c );

// Get the first line in string s1. Returns -1 if not found.
int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length );
char const * getEncodingNameFor( Encoding e );
Encoding getEncodingForName( const QByteArray & name );

struct LineFeed
{
Expand Down
7 changes: 4 additions & 3 deletions src/dict/dsl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1144,8 +1144,9 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
}
else {
try {
articleData =
Iconv::toWstring( getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ), articleBody, articleSize );
articleData = Iconv::toWstring( getEncodingNameFor( static_cast< Encoding >( idxHeader.dslEncoding ) ),
articleBody,
articleSize );
free( articleBody );

// Strip DSL comments
Expand Down Expand Up @@ -1789,7 +1790,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
idx.write( soundDictName.data(), soundDictName.size() );
}

idxHeader.dslEncoding = scanner.getEncoding();
idxHeader.dslEncoding = static_cast< uint32_t >( scanner.getEncoding() );

IndexedWords indexedWords;

Expand Down
18 changes: 9 additions & 9 deletions src/dict/dsl_details.cc
Original file line number Diff line number Diff line change
Expand Up @@ -844,7 +844,7 @@ bool ArticleDom::atSignFirstInLine()
/////////////// DslScanner

DslScanner::DslScanner( string const & fileName ):
encoding( Text::Utf8 ),
encoding( Text::Encoding::Utf8 ),
readBufferPtr( readBuffer ),
readBufferLeft( 0 ),
linesRead( 0 )
Expand Down Expand Up @@ -875,19 +875,19 @@ DslScanner::DslScanner( string const & fileName ):
guessedEncoding.has_value() ) {
switch ( guessedEncoding.value() ) {
case QStringConverter::Utf8:
encoding = Text::Utf8;
encoding = Text::Encoding::Utf8;
break;
case QStringConverter::Utf16LE:
encoding = Text::Utf16LE;
encoding = Text::Encoding::Utf16LE;
break;
case QStringConverter::Utf16BE:
encoding = Text::Utf16BE;
encoding = Text::Encoding::Utf16BE;
break;
case QStringConverter::Utf32LE:
encoding = Text::Utf16LE;
// Data detected as UTF-32LE must select the UTF-32LE encoding, not UTF-16LE.
encoding = Text::Encoding::Utf32LE;
break;
case QStringConverter::Utf32BE:
encoding = Text::Utf32BE;
encoding = Text::Encoding::Utf32BE;
break;
default:
break;
Expand Down Expand Up @@ -976,13 +976,13 @@ DslScanner::DslScanner( string const & fileName ):
qWarning( "Warning: encoding was specified in a Unicode file, ignoring." );
}
else if ( !arg.compare( U"Latin" ) ) {
encoding = Text::Windows1252;
encoding = Text::Encoding::Windows1252;
}
else if ( !arg.compare( U"Cyrillic" ) ) {
encoding = Text::Windows1251;
encoding = Text::Encoding::Windows1251;
}
else if ( !arg.compare( U"EasternEuropean" ) ) {
encoding = Text::Windows1250;
encoding = Text::Encoding::Windows1250;
}
else {
gzclose( f );
Expand Down
4 changes: 2 additions & 2 deletions src/dict/dsl_details.hh
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@ void stripComments( std::u32string &, bool & );
inline size_t DslScanner::distanceToBytes( size_t x ) const
{
switch ( encoding ) {
case Text::Utf16LE:
case Text::Utf16BE:
case Encoding::Utf16LE:
case Encoding::Utf16BE:
return x * 2;
default:
return x;
Expand Down
Loading

0 comments on commit dda91a3

Please sign in to comment.