-
Notifications
You must be signed in to change notification settings - Fork 3
/
DB.py
157 lines (150 loc) · 5.46 KB
/
DB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import sqlite3
import json
import base64
import re
from imp import reload
#class ComplexEncoder(json.JSONEncoder):
# def default(self, obj):
# print(type(obj))
# if isinstance(obj, bytes):
# return 'BLOB DATA'
# return json.JSONEncoder.default(self, obj)
def query(obj, cur, chatid, count=100, start=0):
obj['cols'] = [
'_id',
'key_from_me',
#^ key_from_me == 1 && remote_resource != NULL ==>
# data == NULL => join/leave
# data != NULL => topic change
'data',
'timestamp',
'media_mime_type',
'media_wa_type',
#^meaning: 0 = text, 1 = image; 2 = audio'; 3 = video; 4 = contact; 5 = GPS; 8 = Wapp-call
'media_size',
'media_name',
'media_duration',
'latitude',
'longitude',
'thumb_image',
'remote_resource', #sender (for groups)
'received_timestamp',
'raw_data',
'media_caption',
]
query = "SELECT {col} FROM {tbls} WHERE {ids} AND {count} AND {nocalls}".format(
col = 'M.'+', M.'.join(obj['cols']) + ', C.subject',
tbls = 'messages AS M, chat_list AS C',
ids = 'M.key_remote_jid = C.key_remote_jid AND M.key_remote_jid = "' + chatid + '"',
count = 'M._id >= ' + str(start) + (' AND M._id < '+str(start + count) if count else ''), # if count==0 then don't set an upper bound
nocalls = 'M.media_wa_type <> 8 AND M.media_wa_type <> -1', #ignore calls and the VERY first row
)
obj['cols'].append('subject') #groupchat subject
cur.execute(query)
obj['rows'] = cur.fetchall()
def readCol(row, colname, cols):
if colname in cols:
return row[cols.index(colname)]
else:
return -1
def decodeBytes(obj):
rows = obj['rows']
for i in range(len(rows)):
blobs = []
for j in range(len(rows[i])):
if isinstance(rows[i][j], bytes):
blobs.append(j)
for j in blobs:
if j == obj['cols'].index('raw_data'):
data = base64.b64encode(rows[i][j]).decode('utf-8')
#with open('test'+str(i)+'.jpg', 'wb') as f:
# f.write(rows[i][j])
elif j == obj['cols'].index('thumb_image'):
data = rows[i][j]
if b'/WhatsApp/Media/' in data:
index = data.index(b'/WhatsApp/Media/')
data = data[index:].decode('utf-8')
else:
data = None
else:
try:
data = rows[i][j].decode('utf-8')
except UnicodeDecodeError:
print('UnicodeDecodeError. Could not decode:', rows[i][j])
data = None
rows[i] = rows[i][:j] + (data,) + rows[i][j+1:]
def readTable(src, obj, chatid):
"""Reads every single row in the provided table"""
rows = src['rows']
cols = src['cols']
obj['id'] = chatid
obj['rows'] = []
for i in range(len(rows)):
r = rows[i]
row = {}
row['timestamp'] = readCol(r, 'timestamp', cols)
row['received'] = readCol(r, 'received_timestamp', cols)
row['sender'] = readCol(r, 'remote_resource', cols)
if row['sender']: #ie. not a message you yourself sent
if row['sender'].strip() == '':
row['sender'] = readCol(r, 'key_remote_jid', cols)
if '@' in row['sender']: #trim the @s.whatsapp.com from the phone number
row['sender'] = row['sender'][:row['sender'].index('@')]
row['text'] = readCol(r, 'data', cols)
#non-chat message data
row['type'] = int(readCol(r, 'media_wa_type', cols))
#^meaning: 0 = text, 1 = image; 2 = audio'; 3 = video; 4 = contact; 5 = GPS; 8 = Wapp-call
if readCol(r, 'key_from_me', cols) == 1 and row['sender'] is not None:
if row['text'] is None:
row['type'] = -1 #this means it's a join/leave/kick
else:
row['type'] = -2 #this means it's a group-topic change
if row['type'] > 0: #not a message or any kind of status update
row['media'] = {}
media = row['media']
media['type'] = row['type'] #redundant; accessible from both `media` and its parent `row` objects
media['mimetype'] = readCol(r, 'media_mime_type', cols)
media['filepath'] = readCol(r, 'thumb_image', cols)
file_re = r'\/WhatsApp(\/Media\/.*\.\w{3})'
if media['filepath']: #No filepath in the DB
media['filepath'] = re.search(file_re, media['filepath'])
if media['filepath']: #No matches found for the above regex
media['filepath'] = '..' + media['filepath'].group(1)
media['size'] = readCol(r, 'media_size', cols)
media['name'] = readCol(r, 'media_name', cols)
media['duration'] = readCol(r, 'media_duration', cols)
media['lat-long'] = (readCol(r, 'latitude', cols), readCol(r, 'longitude', cols))
media['thumb'] = readCol(r, 'raw_data', cols)
if media['thumb']:
if media['mimetype'] is None and media['type'] == 1:
media['mimetype'] = 'image/png'
media['thumb'] = 'data:' + media['mimetype'] + ';base64,' + media['thumb']
if (row['text'] is None):
row['text'] = readCol(r, 'media_caption', cols)
else:
row['media'] = None
obj['rows'].append(row)
obj['name'] = readCol(r, 'subject', cols)
def writeToJSON(obj):
"""writes an object obj generated by readTable() to a json file"""
with open(obj['id'] + '-db.json', 'w+') as f:
f.write(json.dumps(obj))
def main(count=100, start=0):
dbconnect = sqlite3.connect('msgstore.db')
cur = dbconnect.cursor()
cur.execute('SELECT DISTINCT key_remote_jid FROM messages ORDER BY _id')
chats = cur.fetchall()
for chatid in chats:
chatid = chatid[0]
print('reading: '+chatid)
dbobj = {'cols': [], 'rows': []}
query(dbobj, cur, chatid, count=0) #fetch this chat's contents, store it in the dbobj object
if len(dbobj['rows']) == 0:
continue
decodeBytes(dbobj) #decode the bytestrings in the retulting table
chat = {} #will contain ONLY this chat, and write it to a file
readTable(dbobj, chat, chatid)
writeToJSON(chat)
print('rows: ', len(dbobj['rows']))
if len(dbobj['rows']) > 6000:
break