Skip to content

Commit

Permalink
Merge pull request #7 from sanori/dev
Browse files Browse the repository at this point in the history
Zip password with MBCS encoding
  • Loading branch information
sanori authored Jul 17, 2022
2 parents f84e77d + ddffbc1 commit f182ed3
Show file tree
Hide file tree
Showing 8 changed files with 70 additions and 25 deletions.
2 changes: 1 addition & 1 deletion LICENSE.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Copyright 2016 Joo-Won Jung
Copyright (c) 2016-2022 Joo-Won Jung

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ optional arguments:
-h, --help show this help message and exit
-e ENCODING, --encoding ENCODING
character encoding of filename in the .zip
-p PASSWORD, --password PASSWORD
password for encrypted .zip
```


Expand All @@ -35,11 +37,12 @@ optional arguments:
Return the information of the files in zip archive `filename`
with character `encoding`

### extractZip(filename, encoding='utf-8', filters=None)
### extractZip(filename, encoding='utf-8', filters=None, password=None)
Extract files in zip archive `filename` on current directory.
Assume that the file names in zip archive are encoded as `encoding`.
Only the files whose names start with one of the values in the `filters` list
are extracted if `filters` is provided.
Use `password` to decrypt an encrypted zip archive.

### fixZipFilename(filename, enc)
Fix `filename` as UNICODE string which is originally encoded as `enc`.
Expand All @@ -53,6 +56,6 @@ But, in non-ASCII, non-Western environment, it makes trouble due to filenames.
Because the ZIP format is so old (1993), there is no standard character encoding for the file names of zip archive entries.
Most zip file entries are encoded in a legacy, locale-specific character encoding.

In modern UNICODE based environment or global data processing environment such as Linux, this makes inconvinience, less portability, mangled file names, fail to extract the file, and so on.
In a modern Unicode-based environment or a global data processing environment such as Linux, this causes inconvenience, reduced portability, mangled file names, failures to extract files, and so on.

This module may mitigate the inconviniences.
This module may mitigate the inconveniences.
5 changes: 2 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@ build-backend = "setuptools.build_meta"

[project]
name = "unzipmbcs"
version = "0.1.2"
version = "0.2.0"
description = "UnZip for non-UTF8 encoding such as cp949, sjis, gbk, euc-kr, euc-jp, and gb2312"
readme = "README.md"
license = { file = "LICENSE.txt" }
keywords= [
"unzip", "pkzip", "non-UTF8", "mbcs",
"cp949", "sjis", "shift_jis", "gbk", "gb18030"
"cp949", "sjis", "shift_jis", "gbk", "gb18030",
]
authors = [
{name="Joo-Won Jung", email="[email protected]"},
Expand Down
Binary file removed test/NewFolder.zip
Binary file not shown.
Binary file added test/lhaplus-zkenc.zip
Binary file not shown.
57 changes: 43 additions & 14 deletions test/test_unzipmbcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,62 @@
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
import unzipmbcs

def setUpModule():
    """
    Module-level fixture: refuse to run the tests on a non-UTF-8 filesystem.

    The check passes when the filesystem encoding is UTF-8 (compared
    case-insensitively) or when the PYTHONIOENCODING environment variable
    is set to a non-empty value.  Otherwise an Exception is raised.
    """
    fsIsUtf8 = sys.getfilesystemencoding().lower() == 'utf-8'
    ioEncoding = os.environ.get('PYTHONIOENCODING')
    if fsIsUtf8 or ioEncoding:
        return
    raise Exception('non-UTF8 filesystem. set PYTHONIOENCODING as your filesystem encoding!')

def clearFiles(fileList):
    """
    Remove the extracted entries in `fileList` and any parent directories
    that become empty as a result.

    Each entry is a file path or a directory path (a trailing '/' is
    accepted).  Files are deleted, empty directories are removed, and then
    the chain of parent directories is walked upwards, removing each one
    that is empty and stopping at the first non-empty one.

    Entries or parents that do not exist are skipped instead of raising,
    so clean-up still completes after a partially failed extraction.
    """
    for entry in fileList:
        if os.path.isfile(entry):
            os.remove(entry)
        elif os.path.isdir(entry) and not os.listdir(entry):
            os.rmdir(entry)

        parent = os.path.dirname(entry)
        while parent != '':
            if os.path.isdir(parent):
                if os.listdir(parent):
                    break
                os.rmdir(parent)
            # A missing parent is skipped: for an entry such as 'dir/',
            # dirname() yields 'dir', which was just removed above.
            higher = os.path.dirname(parent)
            if higher == parent:
                # Reached a filesystem root; dirname() no longer shrinks.
                break
            parent = higher

class TestFromZip(unittest.TestCase):
    """List and extract an unencrypted archive, reading its names as cp949."""
    filename = 'win-default.zip'
    encoding = 'cp949'
    # Entry names expected after decoding with `encoding`.
    expected = [u'똠방각하.txt', u'한글 디렉토리/새 텍스트 문서.txt']

    def testListZip(self):
        """listZip should report the decoded entry names, in order."""
        result = unzipmbcs.listZip(self.filename, self.encoding)
        self.assertEqual([entry[0] for entry in result], self.expected)

    def testExtractZip(self):
        """extractZip should create every expected path on disk."""
        unzipmbcs.extractZip(self.filename, self.encoding)
        # Use an explicit loop: on Python 3 a bare map() is lazy, so
        # assertions wrapped in it would never be evaluated.
        for path in self.expected:
            self.assertTrue(os.path.exists(path), path + ' not exist')
        clearFiles(self.expected)

class TestEncryptedZip(unittest.TestCase):
    """List and extract a password-protected archive, reading names as sjis."""
    filename = 'lhaplus-zkenc.zip'
    encoding = 'sjis'
    # Entry names expected after decoding with `encoding`.
    expected = [u'ローマ字テキスト.txt', u'秘密/パスワード.txt']
    # Non-ASCII password, so the password encoding path is exercised too.
    password = '全角暗号'

    # ListZip should work without password
    def testListZip(self):
        """listZip should report the decoded entry names without a password."""
        result = unzipmbcs.listZip(self.filename, self.encoding)
        self.assertEqual([entry[0] for entry in result], self.expected)

    def testExtractWithoutPassword(self):
        """Extracting an encrypted archive with no password must fail."""
        with self.assertRaises(RuntimeError):
            unzipmbcs.extractZip(self.filename, self.encoding)

    def testExtractWithWrongPassword(self):
        """Extracting with an incorrect password must fail."""
        with self.assertRaises(RuntimeError):
            unzipmbcs.extractZip(self.filename, self.encoding, password='wrongpass')

    def testExtractWithPassword(self):
        """Extracting with the right password should create every expected path."""
        unzipmbcs.extractZip(self.filename, self.encoding, password=self.password)
        # Use an explicit loop: on Python 3 a bare map() is lazy, so
        # assertions wrapped in it would never be evaluated.
        for path in self.expected:
            self.assertTrue(os.path.exists(path), path + ' not exist')
        clearFiles(self.expected)

if __name__ == '__main__':
unittest.main()
Binary file added test/win-default.zip
Binary file not shown.
22 changes: 18 additions & 4 deletions unzipmbcs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#! python
######################## BEGIN LICENSE BLOCK ########################
# Copyright 2016 Joo-Won Jung
# Copyright (c) 2016-2022 Joo-Won Jung
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -47,24 +47,35 @@ def fixZipFilename(filename, enc):
raise e
return result

def transcodeBytes(str, toEnc, fromEnc='utf-8'):
    """
    Return `str` as a byte string encoded in `toEnc`.

    `str` may be text (Python 3 `str`, Python 2 `unicode`), which is
    encoded directly, or a byte string (Python 3 `bytes`, Python 2 `str`),
    which is first decoded from `fromEnc` and then re-encoded.

    Raises UnicodeError if the value cannot be decoded from `fromEnc` or
    cannot be represented in `toEnc`.
    """
    # NOTE: the parameter name shadows the builtin `str`; it is kept so
    # that existing keyword callers keep working.
    if isinstance(str, (bytes, bytearray)):
        return bytes(str).decode(fromEnc).encode(toEnc)
    return str.encode(toEnc)

def _extractFileFromZip(z, fn, ofn):
"""
extract a file `fn` in ZipFile `z` as `ofn`
"""
f = open(ofn, 'wb')
f.write(z.read(fn))
try:
f.write(z.read(fn))
except RuntimeError as e:
f.close()
os.remove(ofn)
raise e
f.close()


def extractZip(filename, encoding='utf-8', filters=None):
def extractZip(filename, encoding='utf-8', filters=None, password=None):
"""
Extract files in zip archive `filename` on current directory.
Assume that the file names in zip archive are encoded as `encoding`.
Only the files prefixed the values of `filters` list are extracted
if `filters` are provided.
"""
z = zipfile.ZipFile(filename, 'r')
if password:
z.setpassword(transcodeBytes(password, encoding))
l = z.namelist()
for fn in l:
if len(fn) == 0 or fn[-1] == '/':
Expand Down Expand Up @@ -127,6 +138,9 @@ def _main():
parser.add_argument('-e', '--encoding',
help='character encoding of filename in the .zip',
default='utf-8')
parser.add_argument('-p', '--password',
help='password for encrypted .zip',
default=None)
parser.add_argument('zipfile', help='.zip file to unzip')
parser.add_argument('target', nargs='*',
help='file prefix to extract')
Expand All @@ -141,7 +155,7 @@ def _main():
% tuple([entry[1]] + list(entry[2][:-1]) + [entry[0]]))
elif args.cmd == 'x':
extractZip(args.zipfile, encoding=args.encoding,
filters=args.target)
filters=args.target, password=args.password)
else:
print('Unknown command:', args.cmd)

Expand Down

0 comments on commit f182ed3

Please sign in to comment.