forked from wchristian/kc-vita-translation
-
Notifications
You must be signed in to change notification settings - Fork 1
/
translate_utf8_binary.pl
264 lines (238 loc) · 10.7 KB
/
translate_utf8_binary.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
use 5.020;
use strictures 2;
use IO::All -binary;
use List::Util qw'sum uniq min';
use Encode qw' decode encode ';
use Tk;
use Tk::Font;
use utf8;
use lib '.';
use binary_translations;
run();
sub map_str_to_multi_chars {
my ( $tr, $enc, $length_target, $used, %prepared ) = @_;
my %rev_prep = reverse %prepared;
my @glyphs = sort { length $a <=> length $b } sort keys %prepared;
my @parts = split /(?<=\])|(?=\[)/, $tr;
my $e = sub { encode $enc, join "", @_ };
my $l = sub { length $e->(@_) };
return $e->(@parts) if $l->(@parts) == $length_target;
my ( %seen, @failed, $proc, %closest );
$proc = sub { # the lists made before each loop aren't faster, but easier to debug
my @parts = @_;
my $length_current = $l->(@parts);
my $need_to_shrink = $length_current - $length_target;
my @to_process = grep +( $parts[$_] !~ /^\[/ and not $rev_prep{ $parts[$_] } ), 0 .. $#parts;
for my $i (@to_process) {
my $part = $parts[$i];
my @matches = grep index( $part, $_ ) != -1, @glyphs;
@matches = grep length > ( ( $need_to_shrink > 25 ) ? 3 : ( $need_to_shrink > 15 ) ? 2 : 1 ), @matches if $need_to_shrink > 0;
for my $glyph (@matches) {
my ( $p1, $p2 ) = split /\Q$glyph\E/, $part, 2;
my @parts2 = @parts;
splice @parts2, $i, 1, grep length, $p1, $prepared{$glyph}, $p2;
my $l2 = $l->(@parts2);
my $diff = $l2 - $length_target;
return @parts2 if not $diff;
my $fail = "length in encoding $enc: $l2 : " . join "|", map +( $rev_prep{$_} ? "\$$rev_prep{$_}" : $_ ), @parts2;
push @{ $closest{ abs $diff } }, $fail;
push @failed, $fail;
next if $seen{ $e->(@parts2) };
my @deeper = $proc->(@parts2);
$seen{ $e->(@parts2) }++;
die "tried for too long, add more things to the font mod pairs" if 80_000 == keys %seen;
return @deeper if @deeper;
}
}
return;
};
my @mapped = eval { $proc->(@parts) };
my $closest_diff = min keys %closest;
push @failed, map "closest: $_", $closest{$closest_diff}->@* if $closest_diff;
@mapped = @parts if not @mapped;
$used->{ $rev_prep{$_} }++ for grep defined $rev_prep{$_}, @mapped;
return ( $e->(@mapped), uniq @failed );
}
sub map_tr_to_multi_chars {
my ( $jp, $enc, $obj, $used, %prepared ) = @_;
my $target_length = length encode $enc, $jp;
my ( $tr, @failed ) = map_str_to_multi_chars( $obj->{tr}, $enc, $target_length, $used, %prepared );
my $l_tr = length $tr;
return "length wanted: $target_length", @failed, "translation '$jp' ($target_length) => '$obj->{tr}' ($l_tr) doesn't match in length"
if $target_length != $l_tr;
$obj->{tr_mapped}{$enc} = $tr;
return;
}
sub trim_nl {
my ($s) = @_;
$s =~ s/[\r\n]//g;
return $s;
}
sub add_mapped {
my ( $dictionary, $enc, $used, %mapping ) = @_;
return map map_tr_to_multi_chars( $_, $enc, $dictionary->{$_}, $used, %mapping ), grep length $dictionary->{$_}{tr}, sort keys $dictionary->%*;
}
sub get_hits {
my ( $content, $jp, $enc ) = @_;
my @hits;
my $pos = 0;
while ( ( my $hit = index $content, encode( $enc, $jp ), $pos ) != -1 ) {
push @hits, $hit;
$pos = $hit + 1;
}
return @hits;
}
sub check_for_null_bracketing { # looks at the following char as utf8 one to make scanning easier
my ( $content, $jp, $enc, $hit ) = @_;
return if $enc eq "UTF-16LE";
my $decode_content = decode $enc, substr $content, $hit;
my $pre = ord substr $content, $hit - 1, 1;
my $post = ord substr $decode_content, length $jp, 1;
my %acceptable = map +( $_, 1 ), ( 0, 0x1, 0x4, 0x6, 0x7, 0x8, 0xB, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0xFFFD );
return ( $pre == 0 and $acceptable{$post} );
}
sub utf8_asset_files {
my $has_find = -e "c:/cygwin/bin/find.exe";
say "has find: $has_find";
my $src_dir = "../kc_original_unpack/Media/Unity_Assets_Files/";
my @list = $has_find ? split /\n/, `c:/cygwin/bin/find "$src_dir" -type f` #
: io($src_dir)->All_Files;
@list = grep !/\.(tex|dds(_\d)*|mat|gobj|shader|txt|xml|ttf|amtc|ani|avatar|cbm|flr|fsb|mesh|obj|physmat2D|rtex|script|snd|[0-9]+)$/, @list;
@list = map +( ref $_ ? $_ : io($_) ), @list;
@list = map +{ file => $_, filename => $_->filename, fileparts => [ split /\/|\\/, $_ ], enc => "UTF-8" }, @list;
$_->{fileid} = join "/", @{ $_->{fileparts} }[ 4 .. $#{ $_->{fileparts} } ] for @list;
return @list;
}
sub report_near_miss {
my ( $file_hit, $hit, $enc, $jp, $content ) = @_;
my $mod = $hit - ( $hit % 16 );
my ( $offset, $extract ) = (0);
while ( $offset < 3 ) {
$extract = decode $enc, substr $content, $hit - 16 + $offset, 28 + 2 * length encode $enc, $jp;
last if $extract =~ /\Q$jp\E/;
$offset++;
}
my ($ords) = map "[$_]", join "|", map uc sprintf( "%x", ord ), split //, $extract;
my $msg = sprintf "hit '%s' %08x %08x not marked skipped or ok, please verify %s in >%s< %s", $file_hit, $mod, $hit, $jp, $extract, $ords;
# need to remain: newlines: A D, jp space: 3000
$msg =~ s/\x{$_}/■/g
for 0 .. 8,
qw( B C E F 10 11 12 13 15 17 18 19 1A 1B 1C 1D 1E 14 600 900 300 500 B00 1D00 D00 1700 800 1500 1900 F00 700 1B00 1D00 1F00 1300 1100 2000 2100 2300 2500 2700 2900 2A00 2B00 2D00 3200 321E 3428 3C3D 3D00 3E30 3F00 4300 4900 4C30 4D00 661A 7B00 7D00 FFFD );
$msg =~ s/\r/\\r/g;
$msg =~ s/\n/\\n/g;
say $msg;
}
sub duplicate_check {
my %seen;
$seen{$_}++ for binary_translations->data;
my @duplicates = grep $seen{$_} > 1, keys %seen;
die "following keys are duplicate in dictionary: @duplicates" if @duplicates;
return;
}
sub run {
$|++;
binmode STDOUT, ":encoding(UTF-8)";
binmode STDERR, ":encoding(UTF-8)";
my $do_blank = grep /--blank/, @ARGV;
say "prepping dictionary";
my %mapping = do {
my $unicode = 0xE000;
my @pairs = grep length $_, map split( /\|/, $_ ), map trim_nl($_), io("font_mod_character_pairs")->getlines;
map +( $pairs[$_] => chr( $unicode + $_ ) ), 0 .. $#pairs;
};
duplicate_check;
my %tr = binary_translations->data;
$tr{$_}{tr} //= "" for grep !defined $tr{$_}{tr}, sort keys %tr;
my @tr_keys = reverse sort { length $a <=> length $b } sort keys %tr;
my ( $mw, $font_name ) = ( MainWindow->new, "Ume P Gothic S4" );
my $font = $mw->fontCreate( "test", -family => $font_name, -size => 18 );
my %what = $font->actual;
die "didn't create right font, but: $what{-family}" if $what{-family} ne $font_name;
for my $jp (@tr_keys) {
$tr{$jp}{width} = $font->measure($jp);
$tr{$jp}{width_tr} = $font->measure( $tr{$jp}{tr} );
$tr{$jp}{width_ratio} = sprintf "%.2f", $tr{$jp}{width_tr} / $tr{$jp}{width};
}
for my $jp ( reverse sort { $tr{$a}{width_ratio} <=> $tr{$b}{width_ratio} } sort keys %tr ) {
next if $tr{$jp}{width_ratio} <= 1;
my $msg = " $tr{$jp}{width_ratio} = $tr{$jp}{width} : $tr{$jp}{width_tr} -- $jp #-# $tr{$jp}{width_ratio} = $tr{$jp}{width} : $tr{$jp}{width_tr} -- $tr{$jp}{tr}";
$msg =~ s/\n/\\n/g;
$msg =~ s/\r/\\r/g;
$msg =~ s/#-#/\n/g;
say $msg;
}
print "\n";
my %used;
my @too_long = map add_mapped( \%tr, $_, \%used, %mapping ), "UTF-16LE", "UTF-8";
s/\n/\\n/g for @too_long;
s/\r/\\r/g for @too_long;
die join "\n", @too_long, "\n" if @too_long;
my @unused = grep !$used{$_}, keys %mapping;
say "following tuples unused: @unused\nfollowing tuples used: '" . ( join "|", sort keys %used ) . "'\n" if @unused;
if ($do_blank) {
for my $enc ( "UTF-16LE", "UTF-8" ) {
$tr{$_}{tr_mapped}{$enc} = "\0" x length encode $enc, $_ for keys %tr;
}
}
# this converts any single string ok/skip entries into arrays, or fills in empty arrays if there's none
for my $entry ( values %tr ) {
$entry->{$_} = !defined $entry->{$_} ? [] : !ref $entry->{$_} ? [ $entry->{$_} ] : $entry->{$_} for qw( ok skip );
}
say "grabbing file list";
my @list = utf8_asset_files;
push @list, { #
file => io("../kc_original/Media/Managed/Assembly-CSharp.dll"),
filename => "Assembly-CSharp.dll",
fileparts => [ split /\/|\\/, "../kc_original/Media/Managed/Assembly-CSharp.dll" ],
enc => [ "UTF-16LE", "UTF-8" ],
fileid => "a-csharp",
};
@list = sort { lc $a->{fileid} cmp lc $b->{fileid} } @list;
say "prepped";
my %found;
my %unmatched;
my %hit;
for my $file (@list) {
my $content = $file->{file}->all;
my $f_enc = $file->{enc};
my $found;
for my $enc ( ref $f_enc ? $f_enc->@* : $f_enc ) {
for my $jp (@tr_keys) {
next unless #
my @hits = get_hits $content, $jp, $enc;
my %obj = $tr{$jp}->%*;
for my $hit (@hits) {
my $file_hit = "$file->{fileid} $hit";
next if grep $file_hit eq $_, $obj{skip}->@*;
$hit{$jp}++;
if ( !grep $file_hit eq $_, $obj{ok}->@* and !check_for_null_bracketing $content, $jp, $enc, $hit ) {
$unmatched{$jp}++;
report_near_miss $file_hit, $hit, $enc, $jp, $content;
next;
}
next if !$do_blank and !length $obj{tr};
substr( $content, $hit, length $_ ) = $_ for $obj{tr_mapped}{$enc};
$found++;
$found{$jp}++;
}
}
}
my @file_parts = $file->{fileparts}->@*;
$file_parts[1] = "kc_original_unpack_modded";
my $target = join "/", @file_parts;
if ( !$found ) {
io($target)->unlink if -e $target;
next;
}
io( io->file($target)->filepath )->mkpath;
io($target)->print($content);
}
my @maybe = map sprintf( " %-" . ( 30 - length $_ ) . "s %-30s hit x %3s, nomatch x %3s, match x %3s", $_, $tr{$_}{tr}, $hit{$_}, $unmatched{$_}, $hit{$_} - $unmatched{$_} ),
reverse sort { length $a <=> length $b } sort keys %unmatched;
my @nowhere = map sprintf( " %-" . ( 30 - length $_ ) . "s $tr{$_}{tr}", $_ ), grep +( !$found{$_} and !$unmatched{$_} ), @tr_keys;
s/\n/\\n/g for @maybe, @nowhere;
s/\r/\\r/g for @maybe, @nowhere;
say join "\n", "", "strings not always identified confidently:", @maybe, "", "strings found nowhere:", @nowhere;
say "\ndone";
return;
}