Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

attempt to use encoding header for decoding #11

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 23 additions & 5 deletions src/loader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use crate::structs::{TXTSong, Source};
use std::fs::File;
use std::io::Read;
use std::path::{Path, PathBuf};
use regex::Regex;

error_chain! {
errors {
Expand Down Expand Up @@ -37,17 +38,34 @@ error_chain! {
}
}

fn read_file_to_string<P: AsRef<Path>>(p: P) -> Result<String> {
#[doc(hidden)]
pub fn read_file_to_string<P: AsRef<Path>>(p: P) -> Result<String> {
let p = p.as_ref();
let mut f = File::open(p).chain_err(|| ErrorKind::IOError)?;
let mut reader: Vec<u8> = Vec::new();
f.read_to_end(&mut reader)
.chain_err(|| ErrorKind::IOError)?;

// detect encoding and decode to String
let chardet_result = chardet::detect(&reader);
let whtwg_label = chardet::charset2encoding(&chardet_result.0);
let coder = encoding::label::encoding_from_whatwg_label(whtwg_label);
// decode as ascii and search for ENCODING Header
let test_coder = encoding::label::encoding_from_whatwg_label("ascii").unwrap();
let test_content = match test_coder.decode(&reader, encoding::DecoderTrap::Ignore) {
Ok(x) => x,
Err(e) => bail!(ErrorKind::DecodingError(e.into_owned())),
};
let mut whtwg_label = String::new();
match Regex::new(r"#ENCODING:([A-Za-z0-9\-_:.]+)\s*\n").unwrap().captures(&test_content) {
Some(cap) => {
// get encoding from header
whtwg_label.push_str(cap.get(1).unwrap().as_str());
},
None => {
// detect encoding
let chardet_result = chardet::detect(&reader);
whtwg_label.push_str(chardet::charset2encoding(&chardet_result.0));
},
};
// decode to String
let coder = encoding::label::encoding_from_whatwg_label(&whtwg_label);
let file_content = match coder {
Some(c) => match c.decode(&reader, encoding::DecoderTrap::Ignore) {
Ok(x) => x,
Expand Down
15 changes: 15 additions & 0 deletions tests/txt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ extern crate ultrastar_txt;
use std::collections::HashMap;
use ultrastar_txt::*;
use url::Url;
use std::path::Path;

// usage:
// assert_error_kind!(some_err, ErrorKind::MyErrorType)
Expand Down Expand Up @@ -356,6 +357,20 @@ fn remote_url_audio() {
Source::Remote(Url::parse("https://www.example.com/Testfile.mp3").unwrap()));
}

// Loads the fixture that carries an `#ENCODING:UTF8` header tag and checks
// that the degree sign in the `#MP3` value is still intact after decoding.
#[test]
fn encoding_header_tag() {
    let fixture = Path::new("tests/txts/encoding_header_tag.txt");
    let contents = read_file_to_string(fixture).unwrap();
    let parsed = parse_txt_header_str(contents.as_ref()).unwrap();

    let expected = Source::parse("petit milady - 360° Hoshi no Orchestra (TV).mp3");
    assert_eq!(parsed.audio_path, expected);
}

// Pins the current decoding result for a second fixture: the expected file
// name contains `째` where `encoding_header_tag` expects `°`.
// NOTE(review): presumably this fixture has no usable `#ENCODING` tag, so the
// charset is auto-detected and guessed wrongly — the fixture is not shown
// here, confirm before changing the expected string.
#[test]
fn encoding_misidentified() {
    let fixture = Path::new("tests/txts/encoding_misidentified.txt");
    let contents = read_file_to_string(fixture).unwrap();
    let parsed = parse_txt_header_str(contents.as_ref()).unwrap();

    let expected = Source::parse("petit milady - 360째 Hoshi no Orchestra (TV).mp3");
    assert_eq!(parsed.audio_path, expected);
}

// Returns the text of the "simple txt with all features" fixture, embedded
// into the test binary at compile time via `include_str!`.
fn get_simple_txt_str() -> &'static str {
    const SIMPLE_TXT: &str = include_str!("txts/simple_txt_with_all_features.txt");
    SIMPLE_TXT
}
Expand Down
259 changes: 259 additions & 0 deletions tests/txts/encoding_header_tag.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
#TITLE:360° Hoshi no Orchestra (TV)
#ARTIST:petit milady
#LANGUAGE:Japanese
#GENRE:Anime
#YEAR:2018
#CREATOR:Currymalker
#MP3:petit milady - 360° Hoshi no Orchestra (TV).mp3
#COVER:petit milady - 360° Hoshi no Orchestra (TV) [CO].jpg
#VIDEO:petit milady - 360° Hoshi no Orchestra (TV).mp4
#MEDLEYSTARTBEAT:1456
#MEDLEYENDBEAT:2011
#BPM:343.9
#GAP:513
#ENCODING:UTF8
: 0 5 11 San
: 8 4 11 byaku
: 16 4 11 roku
: 24 6 11 ju
: 32 6 9 u
: 40 4 4 do
: 48 5 9 ho
: 56 6 11 shi
: 64 7 9 no
: 73 5 11 or
: 80 5 13 ches
: 88 4 11 tra
: 94 6 13 ~
: 105 2 11 ho
: 109 6 9 ra
- 125
: 129 5 11 Fu
: 137 5 11 ta
: 144 6 13 ri
: 153 5 14 no
: 161 4 13 ko
: 169 6 6 ko
* 177 5 9 ro
: 185 2 13 ni
: 189 5 14 ~
: 196 3 13 na
: 201 6 13 ri
- 211
: 213 2 9 Hi
: 217 10 9 bi
: 229 2 11 i
: 233 9 11 ta
- 287
: 500 3 2 Yo
: 505 2 2 zo
* 509 5 4 ~
: 516 3 0 ra
: 521 7 0 ni
- 531
: 533 1 -3 U
: 536 2 0 ka
: 540 5 2 ~
: 549 2 0 n
: 553 6 0 da
- 562
: 564 2 4 Ko
: 568 9 5 do
: 580 3 -1 ku
* 585 6 -3 na
- 594
: 596 2 -3 Hi
: 600 2 5 ka
: 604 6 7 ~
: 612 3 4 ri
: 617 7 0 wo
- 630
: 632 6 0 Hi
: 641 5 2 to
: 648 6 4 wa
* 657 4 9 ho
* 664 5 7 shi
: 672 5 9 to
: 680 6 4 yo
: 688 5 0 bi
: 696 10 0 chi
: 708 2 -1 ri
: 712 10 -1 ba
: 724 2 0 me
: 728 17 2 ta
- 753
: 756 2 2 Ho
: 760 2 2 shi
: 764 5 4 ~
: 772 2 0 ta
: 777 9 0 chi
* 788 2 -3 wo
: 792 11 2 se
: 805 2 0 n
: 809 4 0 de
- 818
: 820 2 4 Mu
: 825 10 5 su
: 837 2 0 n
: 841 6 0 de
- 851
: 853 2 2 E
* 857 10 9 ga
: 869 2 2 i
: 873 6 0 ta
- 885
: 888 6 7 To
: 896 2 11 o
: 900 2 12 ~
: 905 5 11 ku
: 913 6 12 ha
: 921 5 11 na
: 929 5 7 re
: 936 7 4 te
: 945 2 7 mo
: 949 6 4 o
: 957 2 2 mo
: 961 7 2 i
- 974
: 976 2 2 To
: 980 3 4 do
: 985 5 5 ki
: 992 2 7 ma
: 997 4 2 su
: 1004 7 0 you
: 1013 3 -1 ni
: 1018 30 0 ~
- 1057
: 1060 2 7 To
: 1064 2 7 o
: 1068 2 5 ku
: 1072 2 4 wo
: 1076 2 5 mi
: 1081 9 7 tsu
: 1092 2 9 me
: 1096 8 7 te
* 1108 2 12 sa
* 1112 4 11 bishi
: 1120 6 4 sou
: 1128 8 7 ni
- 1142
: 1144 9 7 Tsu
: 1156 2 9 ku
: 1160 7 7 ru
- 1170
* 1172 2 12 E
: 1176 5 11 ga
: 1184 7 4 o
: 1193 9 2 ga
- 1206
: 1208 5 2 Ka
: 1216 5 4 ku
: 1224 6 7 shi
: 1232 5 4 ta
: 1240 5 2 na
: 1248 5 4 mi
: 1256 6 7 da
: 1264 5 9 no
: 1272 6 11 ri
: 1280 10 12 ~
* 1292 2 11 yu
* 1296 2 9 u
: 1300 9 9 wo
- 1316
: 1319 6 9 Ki
: 1327 3 4 ke
: 1332 2 7 nai
: 1336 2 9 ~
: 1340 2 7 ma
: 1344 14 7 ma
- 1368
* 1375 3 0 Na
: 1380 2 7 ga
: 1384 2 9 ~
: 1388 2 7 re
: 1392 2 9 bo
: 1397 1 7 shi
: 1400 5 9 wo
: 1408 13 7 sa
: 1424 5 9 ga
: 1432 2 11 shi
: 1436 2 9 te
: 1440 6 9 ita
- 1453
: 1456 5 11 Nan
: 1464 4 11 byaku
: 1472 6 11 kou
: 1480 5 11 nen
: 1488 6 9 ma
* 1496 6 4 e
: 1504 5 9 no
: 1512 6 11 ka
: 1520 6 9 ga
: 1528 6 11 ya
: 1536 6 13 ki
: 1544 2 11 ga
: 1548 9 13 ~
: 1560 3 11 i
: 1565 7 9 ma
- 1581
: 1584 6 11 To
: 1592 6 11 ki
: 1600 5 13 wo
: 1608 2 13 ko
: 1612 2 14 ~
: 1616 5 13 e
: 1624 5 6 te
: 1632 5 9 hi
: 1640 2 13 to
: 1644 6 14 ~
: 1652 2 13 mi
: 1656 2 11 te
: 1660 6 13 ~
: 1668 2 9 ra
: 1672 9 9 su
: 1684 3 11 ka
: 1689 4 11 ra
- 1698
: 1700 2 9 Ho
: 1705 8 4 ho wo
: 1716 2 9 tsu
: 1720 9 9 tau
: 1731 3 11 na
: 1736 10 11 mi
: 1748 2 9 da
* 1752 6 13 de
- 1762
: 1764 2 9 Se
: 1768 2 14 ka
: 1772 6 16 ~
: 1780 2 14 i
: 1784 10 14 ga
: 1796 2 11 ni
: 1800 9 13 ji
* 1812 2 16 n
* 1816 7 9 da
- 1829
: 1832 5 11 Ko
: 1840 2 9 to
* 1844 4 4 ba
: 1852 5 9 ja
: 1860 2 9 ta
: 1864 6 11 ~
: 1872 6 9 ri
: 1880 2 9 na
: 1884 2 11 ~
: 1888 5 13 i
: 1896 6 11 mo
: 1904 2 9 no
: 1908 5 11 ga
: 1916 6 13 ta
: 1924 2 13 ri
: 1928 5 14 ~
: 1936 3 13 ga
- 1942
: 1944 6 11 Ha
: 1952 5 9 ji
: 1960 2 9 ma
: 1964 6 11 ~
: 1972 39 9 ru
E
Loading