-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_indexes.py
executable file
·186 lines (160 loc) · 7.91 KB
/
generate_indexes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env python36
# vim: set expandtab tabstop=4 shiftwidth=4:
# Copyright (c) 2018, CJ Kucera
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the development team nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL CJ KUCERA BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import re
import sys
import lzma
import json
# This script generates an index file which FT/BLCMM Explorer can then use
# to know what elements should be in its tree, rather than having to load all
# the data at start time. This means that there's technically a delay between
# the user clicking on an element and the element being drawn, though in
# practice I haven't been able to actually notice it.
#
# Note that this process is quite slow... We're reading the data line-by-line
# but still need to know the byte position within the file so that we can find
# each element, so we can't take advantage of any of Python's inherent
# optimizations when doing line-based file reads. Since this is a process
# intended to only be run by myself whenever/if the data files get updated, I
# haven't bothered to look into optimizing it. On my machine it takes a good
# twelve minutes to generate.
#
# Internally, the index is a dictionary. The keys are the data filenames (without
# paths), and the values are a list of lists, where each inner list contains the
# following elements:
#
# 1) A list defining exactly where the item should live in the tree
# (its name, basically, but exploded)
# 2) Start position (uncompressed)
# 3) Length (uncompressed)
#
# (The inner lists should more precisely be tuples, but for Reasons we're just
# using lists.)
# Filename of the compressed JSON index written into each game's dump directory.
out_file = 'index.json.xz'

# A shared "<prefix>_" needs at least this many distinct suffixes among the
# top-level object names before those names are grouped under "<prefix>_*".
min_collapse_count = 2

# Print a warning - everyone Not Me won't actually care about this.
# An empty string prints as a bare newline, exactly like a bare print().
startup_banner = (
    '',
    "This utility is only useful if you've updated the resource files with",
    "new data.  It will update the game index files to reflect the new",
    "contents, so it's available in the app.",
    '',
    'Hit Ctrl-C now to exit, or Enter to continue...',
)
for banner_line in startup_banner:
    print(banner_line)

# Block until the user confirms with Enter (Ctrl-C aborts the script here).
input()
# Generate indexes for both games.

# Patterns are compiled once, since they are applied to every line of every
# dump.  `header_re` captures the second whitespace-free token inside the
# quotes of a "Property dump for object '...'" line: the object's full name.
header_re = re.compile(r"Property dump for object '\S+ (\S+)' ")
# Object names are exploded on ':' and '.' to get their path in the tree.
name_split_re = re.compile(r'[:.]')
# Only files with these suffixes are treated as data dumps.
dump_suffixes = ('.dump.xz', '.txt.xz')

for game in ['BL2', 'TPS']:

    print('Indexing {} Game Data'.format(game))
    print('----------------------')

    game_dir = os.path.join('resources', game, 'dumps')
    game_index = os.path.join(game_dir, out_file)

    # collapse_names: lowercased "<prefix>" -> set of suffixes seen after the
    # final '_' of a top-level name.  full_collapse_names: every lowercased
    # top-level name seen, in full.
    collapse_names = {}
    full_collapse_names = set()

    # Loop through files and build the index.  While scanning, each value is
    # [filename, start_pos, length, parts], keyed by the object's full name.
    index = {}
    with os.scandir(game_dir) as it:
        for entry in sorted(it, key=lambda e: e.name.lower()):
            if not entry.name.endswith(dump_suffixes):
                continue
            print('Processing {}'.format(entry.name))
            with lzma.open(entry.path, 'rt', encoding='latin1') as df:
                # True only while the next line to inspect is the one
                # directly after an object's header line.
                reading_second_line = False
                obj_name = None
                # The position is sampled *before* each readline() so that
                # `begin_pos` always refers to the start of `line`.
                begin_pos = df.tell()
                line = df.readline()
                while line:
                    match = header_re.search(line)
                    if match:
                        if obj_name:
                            if reading_second_line:
                                # The previous header was followed directly
                                # by this one, so that object has no data
                                # and never got its `parts` element; drop it
                                # rather than crash on it further down.
                                del index[obj_name]
                            else:
                                # Close out the previous object: it runs up
                                # to the start of this header line.
                                index[obj_name][2] = begin_pos - index[obj_name][1]
                        obj_name = match.group(1)
                        index[obj_name] = [entry.name, begin_pos, 0]
                        reading_second_line = True
                    elif reading_second_line:
                        reading_second_line = False
                        # Omit any object which doesn't have any actual data
                        if '=== Object properties ===' in line:
                            del index[obj_name]
                            obj_name = None
                        else:
                            # Split our object name and add it to our data list
                            main_parts = name_split_re.split(obj_name)
                            index[obj_name].append(main_parts)

                            # Grab info about our top level, for later
                            # processing to see if it makes sense to do
                            # extra splitting on it.
                            top_name = main_parts[0].lower()
                            full_collapse_names.add(top_name)
                            name_parts = top_name.rsplit('_', 1)
                            if len(name_parts) > 1:
                                collapse_names.setdefault(name_parts[0], set()).add(name_parts[1])

                    # Read the next line
                    begin_pos = df.tell()
                    line = df.readline()

                # If we reached the end of file, be sure to 'close out' the
                # last object (or drop it, if its header was the final line
                # of the file and it therefore has no data).
                if obj_name:
                    if reading_second_line:
                        del index[obj_name]
                    else:
                        index[obj_name][2] = begin_pos - index[obj_name][1]

    # Filter out any top-level keys which are substrings of another key,
    # or which don't have enough children
    to_prune = set()
    for key, vals in collapse_names.items():
        # We need at least `min_collapse_count` items underneath us to
        # qualify for collapsing
        if len(vals) < min_collapse_count:
            to_prune.add(key)
        else:
            # Prune every shorter '_'-delimited prefix of a qualifying key,
            # so that only the most specific grouping survives.
            parts = key.split('_')
            for num in range(len(parts)):
                testval = '_'.join(parts[:num])
                if testval in collapse_names:
                    to_prune.add(testval)

    # Also prune any collapsing which matches on a real object name,
    # so we don't have a confusing-looking tree
    to_prune.update(key for key in full_collapse_names if key in collapse_names)

    # Now do the pruning
    for key in to_prune:
        del collapse_names[key]

    # Add in a pre-split Parts list to our index, split
    # out additionally by `collapse_names`, if it applies.
    for data in index.values():
        parts = data[3]
        name_parts = parts[0].rsplit('_', 1)
        if len(name_parts) > 1 and name_parts[0].lower() in collapse_names:
            # Insert a "<prefix>_*" grouping level above the top-level name.
            parts[:1] = ['{}_*'.format(name_parts[0]), parts[0]]

    # Transform to a dict with filenames as the key
    fname_index = {}
    for filename, start_pos, length, parts in index.values():
        fname_index.setdefault(filename, []).append((parts, start_pos, length))

    # Write out our index
    print('Writing index to {}'.format(game_index))
    with lzma.open(game_index, 'wt') as df:
        json.dump(fname_index, df)
    print()

print('Done!')