-
Notifications
You must be signed in to change notification settings - Fork 6
/
dbcexplode.py
executable file
·126 lines (103 loc) · 3.1 KB
/
dbcexplode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/python3
"""Explode Databricks DBC archives into separate, more useful Python/SQL/Scala/Markdown files."""
import json
import sys
import os
def getLangPrefix(cmdstr):
    """Return the language magic named on the first line of a command.

    A command may start with a ``%name`` magic (for example ``%python`` or
    ``%md``).  The returned value is ``name`` without the leading ``%``.

    Only the first whitespace-delimited token of the first line is
    considered, so trailing spaces (``'%sql '``) or content that follows the
    magic on the same line (``'%md # Title'``) do not leak into the result.

    Args:
        cmdstr: The full source text of one notebook command.

    Returns:
        The magic name, or ``''`` when the command is empty, does not start
        with ``%``, or the ``%`` is not followed directly by a name.
    """
    if not cmdstr:
        return ''
    first_line = cmdstr.splitlines()[0]
    if not first_line.startswith('%'):
        return ''
    # The line starts with a non-blank '%', so split() always yields at least
    # one token; dropping its first character removes the '%'.
    return first_line.split(None, 1)[0][1:]
def getExtension(notebook, command):
    """Pick the file extension for one notebook command.

    The command's own ``%language`` prefix wins when it names a known
    language; otherwise the notebook's ``'language'`` entry is used.

    Args:
        notebook: Parsed notebook mapping; ``notebook['language']`` is read
            only when the command has no recognised prefix.
        command: Command mapping; its ``'command'`` entry is the source text.

    Returns:
        ``None`` for an empty command, the extension (without a dot) for a
        known language, or ``''`` when no known language applies.
    """
    source = command['command']
    if len(source) == 0:
        return None

    known = {
        'python': 'py',
        'md': 'md',
        'sql': 'sql',
        'scala': 'scala',
    }

    # First choice: the language magic on the command itself.
    from_magic = known.get(getLangPrefix(source))
    if from_magic is not None:
        return from_magic

    # Fallback: the notebook-wide language, or no extension at all.
    return known.get(notebook['language'], '')
def outdir(inputFile):
    """Create (if needed) and return the output directory for an input file.

    The directory is named after the input path with ``-exploded`` appended.

    Args:
        inputFile: Path of the file that is about to be exploded.

    Returns:
        The path of the output directory, which exists when this returns.

    Raises:
        FileExistsError: If the target path exists but is not a directory.
    """
    target = inputFile + '-exploded'
    # exist_ok avoids the race between a separate existence check and the
    # creation call, while still failing if the path is a regular file.
    os.makedirs(target, exist_ok=True)
    return target
def processjsonfile(filepath):
    """Explode one notebook JSON file into one file per non-empty command.

    The file is skipped (with a message on stdout) unless it parses as a JSON
    object whose ``'version'`` is ``'NotebookV1'``.  For a notebook, every
    non-empty command is written to ``<filepath>-exploded/<name><n>.<ext>``
    where ``<name>`` is the notebook's ``'name'``, ``<n>`` is the 1-based
    position of the command in ``'commands'`` (empty commands still consume a
    number) and ``<ext>`` comes from ``getExtension``.

    A leading ``%language`` magic is removed from the written text; anything
    that follows the magic on the same line is kept as the first output line.

    Args:
        filepath: Path of the candidate notebook JSON file.
    """
    # JSON interchange text is UTF-8, so do not depend on the platform's
    # default encoding when reading it.
    with open(filepath, encoding='utf-8') as f:
        try:
            notebook = json.load(f)
        except ValueError:
            # Raised for malformed JSON and for bytes that are not valid
            # UTF-8 (UnicodeDecodeError is a ValueError).
            notebook = None

    # ensure it is a notebook; valid JSON that is not an object, or an object
    # without a 'version', is simply not a notebook rather than an error.
    if not isinstance(notebook, dict) or notebook.get('version') != 'NotebookV1':
        print('SKIPPING file, ', filepath, '. Not a notebook.')
        return

    # prepare output dir:
    targetDir = outdir(filepath)
    print(os.path.basename(filepath), '->', os.path.basename(targetDir))

    # A path separator inside the notebook name would address a directory
    # that does not exist (or one outside the output directory).
    notebookName = notebook['name'].replace('/', '_').replace(os.sep, '_')

    for commandNo, command in enumerate(notebook['commands'], start=1):
        cmdstr = command['command']
        if not cmdstr:
            continue
        ext = getExtension(notebook, command)
        if getLangPrefix(cmdstr):
            # it has a language prefix (e.g. %python ), so remove that prefix
            # but keep any content written after it on the same line.
            lines = cmdstr.splitlines()
            inline = lines[0].split(None, 1)[1:]
            cmdstr = '\n'.join(inline + lines[1:])
        path = os.path.join(targetDir, notebookName + str(commandNo) + '.' + ext)
        with open(path, 'w', encoding='utf-8') as out:
            out.write(cmdstr)
def iszipfile(filepath):
    """Return True when ``filepath`` is a ZIP archive.

    Delegates to ``zipfile.is_zipfile`` instead of comparing the first three
    bytes by hand, so a file that merely starts with ``PK\\x03`` but is
    truncated or otherwise not an archive is reported as not-a-zip rather
    than failing later when it is opened as one.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        ``True`` if the file is a ZIP archive, ``False`` otherwise.
    """
    from zipfile import is_zipfile
    return is_zipfile(filepath)
def processdir(filepath, deleteFileAfter=False):
    """Run ``processjsonfile`` on every file found under a directory tree.

    Args:
        filepath: Root directory to walk recursively.
        deleteFileAfter: When true, each file is removed right after it has
            been handed to ``processjsonfile``.
    """
    for parent, _subdirs, names in os.walk(filepath):
        for name in names:
            fullpath = os.path.join(parent, name)
            processjsonfile(fullpath)
            if deleteFileAfter:
                os.remove(fullpath)
def _mergetree(srcDir, destDir):
    """Move every file under srcDir to the same relative path under destDir."""
    from shutil import move
    for root, _subdirs, names in os.walk(srcDir):
        targetRoot = os.path.normpath(
            os.path.join(destDir, os.path.relpath(root, srcDir)))
        os.makedirs(targetRoot, exist_ok=True)
        for name in names:
            move(os.path.join(root, name), os.path.join(targetRoot, name))


def processzipfile(filepath):
    """Explode every notebook contained in a zip (DBC) archive.

    The archive is extracted into a fresh temporary directory, each extracted
    file is passed through ``processdir`` (which deletes the extracted file
    afterwards), and the result ends up in ``<filepath>-exploded``.

    If ``<filepath>-exploded`` already exists as a directory (for example
    from an earlier run) the new output is merged into it; a plain
    ``shutil.move`` would instead place the temporary directory *inside* it.

    The temporary directory is removed on both success and failure.

    Args:
        filepath: Path of the zip archive to explode.
    """
    import tempfile
    from shutil import move, rmtree
    from zipfile import ZipFile

    destination = filepath + '-exploded'
    destDir = tempfile.mkdtemp()
    try:
        with ZipFile(filepath, 'r') as dbc:
            dbc.extractall(destDir)
        processdir(destDir, deleteFileAfter=True)
        if os.path.isdir(destination):
            _mergetree(destDir, destination)
        else:
            move(destDir, destination)
    finally:
        # After a successful move the temporary directory no longer exists,
        # which ignore_errors tolerates; in every other case (merge done or
        # an exception in flight) whatever is left of it is discarded.
        rmtree(destDir, ignore_errors=True)
def main():
    """Command-line entry point.

    Expects exactly one argument: the path of a DBC (zip) archive, a single
    notebook JSON file, or a directory of notebook JSON files.

    Exits with status -1 after printing usage when the argument count is
    wrong, and with status 1 when the given path is neither a file nor a
    directory.
    """
    if len(sys.argv) != 2:
        print('sys.argv', sys.argv)
        print("""
Usage: dbc-explode <dbc_file>
Run with example file:
dbc-explode /path/file.dbc
""", file=sys.stderr)
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is intended for interactive use.
        sys.exit(-1)
    #load file:
    filepath = os.path.abspath(sys.argv[1])
    if os.path.isfile(filepath):
        if iszipfile(filepath):
            print("procesing as zip file")
            processzipfile(filepath)
        else:
            print("procesing as json file")
            processjsonfile(filepath)
    elif os.path.isdir(filepath):
        processdir(filepath)
    else:
        # Previously a missing path fell through silently with status 0.
        print('dbc-explode: no such file or directory:', filepath,
              file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()