diff --git a/.gitignore b/.gitignore
index 78fb6457..b0e5b175 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,8 @@
+__pycache__
/cache*
reports/
-__pycache__/
+workspace.xml
+usage.statistics.xml
+shelf/
+*.iml
+gradle.xml
diff --git a/.idea/NEMESYS.iml b/.idea/NEMESYS.iml
deleted file mode 100644
index fae31720..00000000
--- a/.idea/NEMESYS.iml
+++ /dev/null
@@ -1,11 +0,0 @@
diff --git a/.idea/dictionaries/stephan.xml b/.idea/dictionaries/stephan.xml
index 860636e7..a7de52ac 100644
--- a/.idea/dictionaries/stephan.xml
+++ b/.idea/dictionaries/stephan.xml
@@ -1,6 +1,8 @@
+ basesegments
+ nemere
tshark
diff --git a/.idea/nemesys.iml b/.idea/nemesys.iml
deleted file mode 100644
index e5a46f24..00000000
--- a/.idea/nemesys.iml
+++ /dev/null
@@ -1,13 +0,0 @@
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
deleted file mode 100644
index 92bf591e..00000000
--- a/.idea/workspace.xml
+++ /dev/null
@@ -1,570 +0,0 @@
-  [570 lines of IntelliJ workspace-state XML elided: the markup was lost in extraction;
-   the recoverable remnants are editor timestamps (1490278930309, 1495560301252) and
-   watched debugger expressions/code fragments such as len(originalAnalyzer.message.data) // 4,
-   newSegment.analyzer.values, numpy.isnan(similarities).any(), numpy.nanmax(corrarray) > 1.0,
-   and type(self.analyzer).__name__]
\ No newline at end of file
diff --git a/README.md b/README.md
index a42eb686..0a7c26a5 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,7 @@ This is highly experimental software and by no means guaranteed to be fit for pr
## Requirements
* Python 3
+* libpcap for pcapy: `apt-get install libpcap-dev libpq-dev`
* Install packages listed in requirements.txt: `pip install -r requirements.txt`
  * This necessitates installing libpcap for pcapy: `sudo apt-get install libpcap-dev`
* Manual install of Netzob from the ["fix-layer-build" branch](git@github.com:skleber/netzob.git)
diff --git a/eval-nemetyl-messagetype.sh b/eval-nemetyl-messagetype.sh
index 75dbfa94..1c5a4a96 100755
--- a/eval-nemetyl-messagetype.sh
+++ b/eval-nemetyl-messagetype.sh
@@ -1,45 +1,107 @@
#!/bin/bash
#input=input/*-100.pcap
-input=input/*-1000.pcap
+#input=input/*-1000.pcap
#input="input/*-100.pcap input/*-1000.pcap"
#input="input/ntp_SMIA-20111010_deduped-1000.pcap input/smb_SMIA20111010-one_deduped-1000.pcap"
-sigmas="0.6 0.8 1.0 1.2"
-#sigmas="0.9"
+input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap"
+#input="input/maxdiff-fromOrig/*-100*.pcap"
-refines="base original"
-#refines="base"
+#sigmas="0.6 0.8 1.0 1.2"
+# default
+#sigmas="0.9 1.2"
+sigmas="1.2"
+# full
+#segmenters="nemesys"
+segmenters="nemesys"
-cftnext=$(expr 1 + $(ls -d reports/nemetyl-* | sed "s/^.*nemetyl-\([0-9]*\)-.*$/\1/" | sort | tail -1))
-cftnpad=$(printf "%03d" ${cftnext})
-currcomm=$(git log -1 --format="%h")
-report=reports/nemetyl-${cftnpad}-clustering-${currcomm}
-mkdir ${report}
+# full
+#refines="none original base nemetyl"
+# Nemesys options
+# refines="original nemetyl"
+# default
+# refines="original nemetyl"
+refines="nemetyl"
+L2PROTOS="input/awdl-* input/au-* input/wlan-beacons-*"
-for fn in ${input} ; do python src/nemetyl_align-segments.py $fn -t tshark --with-plots ; done;
-for fn in ${input} ; do python src/nemetyl_align-segments.py $fn -t 4bytesfixed --with-plots ; done;
+prefix="nemetyl"
-mv reports/*.pdf ${report}/
-mv reports/*.csv ${report}/
+cftnpad="229"
+for f in reports/${prefix}-* ; do
+ if [ -e "$f" ] ; then
+ cftnext=$(expr 1 + $(ls -d reports/${prefix}-* | sed "s/^.*${prefix}-\([0-9]*\)-.*$/\1/" | sort | tail -1))
+ cftnpad=$(printf "%03d" ${cftnext})
+ fi
+ break
+done
+currcomm=$(git log -1 --format="%h")
+report=reports/${prefix}-${cftnpad}-clustering-${currcomm}
+#echo ${report}
+#exit
+mkdir ${report}
-for sig in ${sigmas} ; do
- for ref in ${refines} ; do
- for fn in ${input} ; do
- python src/nemetyl_align-segments.py ${fn} -r ${ref} -t nemesys --with-plots
- done
- mkdir ${report}/sig${sig}-${ref}
- mv reports/*.pdf ${report}/sig${sig}-${ref}/
- mv reports/*.csv ${report}/sig${sig}-${ref}/
+for fn in ${input} ; do
+ optargs="-r" # relative to IP layer
+ for proto in ${L2PROTOS} ; do
+ if [[ "${fn}" == ${proto} ]] ; then
+ # replace
+ optargs="-l 2"
+ fi
done
+ echo -e "\n\ntshark: ${fn}"
+# echo "$fn -t tshark ${optargs} --with-plots"
+# exit
+ python src/nemetyl_align-segments.py $fn -t tshark ${optargs} --with-plots
+done
+for fn in ${input} ; do
+ optargs="-r"
+ for proto in ${L2PROTOS} ; do
+ if [[ "${fn}" == ${proto} ]] ; then
+ # replace
+ optargs="-l 2"
+ fi
+ done
+ echo -e "\n\n4bytesfixed: ${fn}"
+ python src/nemetyl_align-segments.py $fn -t 4bytesfixed ${optargs} --with-plots
done
+for seg in ${segmenters} ; do
+ for sig in ${sigmas} ; do
+ for ref in ${refines} ; do
+ if [[ ${seg} == "zeros" ]] && [[ ! ${ref} =~ ^(none|PCA1|PCAmocoSF)$ ]] ; then
+ echo ${ref} not suited for zeros segmenter. Ignoring.
+ continue
+ fi
+ for fn in ${input} ; do
+ optargs="-r"
+ for proto in ${L2PROTOS} ; do
+ if [[ "${fn}" == ${proto} ]] ; then
+ # replace
+ optargs="-l 2"
+ fi
+ done
+            echo -e "\n${seg}, sigma ${sig} (${ref}): ${fn}"
+ python src/nemetyl_align-segments.py ${fn} -f ${ref} -t ${seg} -s ${sig} ${optargs} --with-plots
+ done
+ done
+ done
+done
+for fn in ${input} ; do
+ bn=$(basename -- ${fn})
+ strippedname="${bn%.*}"
+ mv reports/${strippedname}/ ${report}/
+done
+mv reports/*.csv ${report}/
+# collect the "messagetype-combined-cluster-statistics.csv" of multiple independent nemetyl-runs
+# We don't need this anymore, after the enhancement of the reportWriter module!
+# python reports/combine-nemetyl-results.py ${report}
-spd-say "Bin fertig!"
\ No newline at end of file
+spd-say "Bin fertig!"
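
The numbering idiom above (repeated in the two eval scripts that follow) scans `reports/` for earlier runs of the same prefix, increments the highest index, and zero-pads it to three digits; `cftnpad="229"` is only the fallback when no earlier report exists. A rough Python equivalent of that logic, as an illustrative sketch only (the function name is made up here):

```python
import glob, re

def next_report_index(prefix: str, fallback: int = 229) -> str:
    """Find the highest existing reports/<prefix>-NNN-* index,
    add one, and zero-pad to three digits; else use the fallback."""
    indices = []
    for path in glob.glob(f"reports/{prefix}-*"):
        match = re.search(rf"{re.escape(prefix)}-(\d+)-", path)
        if match:
            indices.append(int(match.group(1)))
    nextindex = max(indices) + 1 if indices else fallback
    return f"{nextindex:03d}"

print(next_report_index("nemetyl"))  # e.g. "230" if reports/nemetyl-229-... exists
```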
diff --git a/eval-netzob-fms.sh b/eval-netzob-fms.sh
new file mode 100755
index 00000000..93ae6a8a
--- /dev/null
+++ b/eval-netzob-fms.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+
+#input=input/*-100.pcap
+#input=input/*-1000.pcap
+#input="input/*-100.pcap input/*-1000.pcap"
+#input=input/maxdiff-filtered/*-1000.pcap
+#input=input/maxdiff-fromOrig/*-1000.pcap
+
+input=input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap
+#input="input/maxdiff-fromOrig/smb_SMIA20111010-one-rigid1_maxdiff-1000.pcap"
+
+L2PROTOS="input/awdl-* input/au-*"
+
+prefix="netzob-format"
+
+# AWDL
+numpad="206"
+for f in reports/${prefix}-* ; do
+ if [ -e "$f" ] ; then
+ numnext=$(expr 1 + $(ls -d reports/${prefix}-* | sed "s/^.*${prefix}-\([0-9]*\)-.*$/\1/" | sort | tail -1))
+ numpad=$(printf "%03d" ${numnext})
+ fi
+ break
+done
+currcomm=$(git log -1 --format="%h")
+report=reports/${prefix}-${numpad}-fms-${currcomm}
+mkdir ${report}
+
+smin=50
+
+pids=()
+for fn in ${input} ; do
+ # relative to IP layer
+ optargs="-r" # --smax 80
+ for proto in ${L2PROTOS} ; do
+ if [[ "${fn}" == ${proto} ]] ; then
+ # replace at layer 2 absolute
+ optargs="-l 2"
+ # optargs="-l 2 --smax 75"
+ fi
+ done
+# python src/netzob_fms.py --smin ${smin} ${optargs} ${fn} > "${report}/$(basename -s .pcap ${fn}).log" &
+ python src/netzob_fms.py ${optargs} ${fn} >> "${report}/$(basename -s .pcap ${fn}).log" &
+ pids+=( $! )
+done
+
+for pid in "${pids[@]}"; do
+ printf 'Waiting for %d...' "$pid"
+ wait $pid
+ echo 'done.'
+done
+
+mv reports/*clByAlign* ${report}/
+python reports/combine-nemesys-fms.py ${report}/
+
+
+spd-say "Bin fertig!"
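
The `pids` array and `wait` loop above fan the per-trace `netzob_fms.py` runs out into background processes and join them afterwards. For illustration, a minimal Python sketch of the same fan-out/fan-in, assuming a hard-coded trace list (the script derives it from a glob):

```python
import subprocess

pcaps = ["input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap"]  # placeholder list

procs = []
for fn in pcaps:
    # one log file per trace, like "$(basename -s .pcap ${fn}).log" above
    log = open(fn.rsplit("/", 1)[-1].replace(".pcap", ".log"), "a")
    procs.append((subprocess.Popen(["python", "src/netzob_fms.py", "-r", fn],
                                   stdout=log, stderr=subprocess.STDOUT), log))

# join all runs, like the loop over ${pids[@]}
for proc, log in procs:
    print(f"Waiting for {proc.pid}...", end=" ")
    proc.wait()
    log.close()
    print("done.")
```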
diff --git a/eval-netzob-messagetype.sh b/eval-netzob-messagetype.sh
index 5f66449e..79cc47f2 100755
--- a/eval-netzob-messagetype.sh
+++ b/eval-netzob-messagetype.sh
@@ -1,24 +1,55 @@
#!/bin/bash
-# dhcp_SMIA2011101X_deduped-1000.pcap
-python src/netzob_messagetypes.py input/dhcp_SMIA2011101X_deduped-1000.pcap -r --smin 77 --smax 77
-python src/netzob_messagetypes.py input/dhcp_SMIA2011101X_deduped-1000.pcap -r --smin 78 --smax 78
-python src/netzob_messagetypes.py input/dhcp_SMIA2011101X_deduped-1000.pcap -r --smin 79 --smax 79
+prefix="netzob_messagetype"
-# dns_ictf2010_deduped-982-1000.pcap
-python src/netzob_messagetypes.py input/dns_ictf2010_deduped-982-1000.pcap -r --smin 49 --smax 51
+numpad="200"
+for f in reports/${prefix}-* ; do
+ if [ -e "$f" ] ; then
+ numnext=$(expr 1 + $(ls -d reports/${prefix}-* | sed "s/^.*${prefix}-\([0-9]*\)-.*$/\1/" | sort | tail -1))
+ numpad=$(printf "%03d" ${numnext})
+ fi
+ break
+done
+currcomm=$(git log -1 --format="%h")
+report=reports/${prefix}-${numpad}-${currcomm}
+mkdir ${report}
-# nbns_SMIA20111010-one_deduped-1000.pcap
-python src/netzob_messagetypes.py input/nbns_SMIA20111010-one_deduped-1000.pcap -r --smin 57 --smax 59
+## dhcp_SMIA2011101X_deduped-1000.pcap
+#python src/netzob_messagetypes.py input/dhcp_SMIA2011101X_deduped-1000.pcap -r --smin 77 --smax 77
+#python src/netzob_messagetypes.py input/dhcp_SMIA2011101X_deduped-1000.pcap -r --smin 78 --smax 78
+#python src/netzob_messagetypes.py input/dhcp_SMIA2011101X_deduped-1000.pcap -r --smin 79 --smax 79
+#
+#
+## dns_ictf2010_deduped-982-1000.pcap
+#python src/netzob_messagetypes.py input/dns_ictf2010_deduped-982-1000.pcap -r --smin 49 --smax 51
+#
+#
+## nbns_SMIA20111010-one_deduped-1000.pcap
+#python src/netzob_messagetypes.py input/nbns_SMIA20111010-one_deduped-1000.pcap -r --smin 57 --smax 59
+#
+#
+## ntp_SMIA-20111010_deduped-1000.pcap
+#python src/netzob_messagetypes.py input/ntp_SMIA-20111010_deduped-1000.pcap -r --smin 56 --smax 58
+# ntp_SMIA-20111010_maxdiff-100.pcap
+python src/netzob_messagetypes.py input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap -r --smin 56 --smax 58
-# ntp_SMIA-20111010_deduped-1000.pcap
-python src/netzob_messagetypes.py input/ntp_SMIA-20111010_deduped-1000.pcap -r --smin 56 --smax 58
+#
+#
+## smb_SMIA20111010-one_deduped-1000.pcap
+#python src/netzob_messagetypes.py input/nbns_SMIA20111010-one_deduped-1000.pcap -r --smin 54 --smax 55
+#python src/netzob_messagetypes.py input/nbns_SMIA20111010-one_deduped-1000.pcap -r --smin 56 --smax 56
-# smb_SMIA20111010-one_deduped-1000.pcap
-python src/netzob_messagetypes.py input/nbns_SMIA20111010-one_deduped-1000.pcap -r --smin 54 --smax 55
-python src/netzob_messagetypes.py input/nbns_SMIA20111010-one_deduped-1000.pcap -r --smin 56 --smax 56
+
+
+
+
+
+mv reports/*.csv ${report}/
+mv reports/*.pdf ${report}/
+
+spd-say "Bin fertig!"
diff --git a/input/Sources.md b/input/Sources.md
index 8a159425..ad603335 100644
--- a/input/Sources.md
+++ b/input/Sources.md
@@ -122,8 +122,12 @@ with parameters:
```
* with `mergecap -F pcap -w binaryprotocols_merged_XXX.pcap INFILES`
-
-
-
-
+## Private/Own recordings
+
+* WLAN monitor-mode captures recorded while wardriving through Biberach
+* C_SEEMOO/wlan-mgt-priv.pcapng merged from C_SEEMOO/wlan-mgt by mergecap
+* wlan-beacons-priv.pcapng is filtered from this by:
+  * `wlan.fc.type_subtype == 0x0008 && !_ws.expert`
+  * (very common SSIDs could have been reduced further by `!(wlan.ssid == "HZN241577234" || wlan.ssid == "Fritzle")`, but we didn't)
+  * `python ~/Dokumente/git.lab-vs/REUP/nemesys/src/prep_filter-maxdiff-trace.py -l2 -p100[|0|00] wlan-beacons-priv.pcapng`
diff --git a/input/dhcp_SMIA2011101X_deduped-100.pcap b/input/deduped-orig/dhcp_SMIA2011101X_deduped-100.pcap
similarity index 100%
rename from input/dhcp_SMIA2011101X_deduped-100.pcap
rename to input/deduped-orig/dhcp_SMIA2011101X_deduped-100.pcap
diff --git a/input/dhcp_SMIA2011101X_deduped-1000.pcap b/input/deduped-orig/dhcp_SMIA2011101X_deduped-1000.pcap
similarity index 100%
rename from input/dhcp_SMIA2011101X_deduped-1000.pcap
rename to input/deduped-orig/dhcp_SMIA2011101X_deduped-1000.pcap
diff --git a/input/dhcp_SMIA2011101X_deduped-10000.pcap b/input/deduped-orig/dhcp_SMIA2011101X_deduped-10000.pcap
similarity index 100%
rename from input/dhcp_SMIA2011101X_deduped-10000.pcap
rename to input/deduped-orig/dhcp_SMIA2011101X_deduped-10000.pcap
diff --git a/input/dns_ictf2010-new-deduped-100.pcap b/input/deduped-orig/dns_ictf2010-new-deduped-100.pcap
similarity index 100%
rename from input/dns_ictf2010-new-deduped-100.pcap
rename to input/deduped-orig/dns_ictf2010-new-deduped-100.pcap
diff --git a/input/dns_ictf2010-new-deduped-1000.pcap b/input/deduped-orig/dns_ictf2010-new-deduped-1000.pcap
similarity index 100%
rename from input/dns_ictf2010-new-deduped-1000.pcap
rename to input/deduped-orig/dns_ictf2010-new-deduped-1000.pcap
diff --git a/input/dns_ictf2010-new-deduped-10000.pcap b/input/deduped-orig/dns_ictf2010-new-deduped-10000.pcap
similarity index 100%
rename from input/dns_ictf2010-new-deduped-10000.pcap
rename to input/deduped-orig/dns_ictf2010-new-deduped-10000.pcap
diff --git a/input/dns_ictf2010_deduped-100.pcap b/input/deduped-orig/dns_ictf2010_deduped-100.pcap
similarity index 100%
rename from input/dns_ictf2010_deduped-100.pcap
rename to input/deduped-orig/dns_ictf2010_deduped-100.pcap
diff --git a/input/dns_ictf2010_deduped-2.pcap b/input/deduped-orig/dns_ictf2010_deduped-2.pcap
similarity index 100%
rename from input/dns_ictf2010_deduped-2.pcap
rename to input/deduped-orig/dns_ictf2010_deduped-2.pcap
diff --git a/input/dns_ictf2010_deduped-982-1000.pcap b/input/deduped-orig/dns_ictf2010_deduped-982-1000.pcap
similarity index 100%
rename from input/dns_ictf2010_deduped-982-1000.pcap
rename to input/deduped-orig/dns_ictf2010_deduped-982-1000.pcap
diff --git a/input/dns_ictf2010_deduped-9911-10000.pcap b/input/deduped-orig/dns_ictf2010_deduped-9911-10000.pcap
similarity index 100%
rename from input/dns_ictf2010_deduped-9911-10000.pcap
rename to input/deduped-orig/dns_ictf2010_deduped-9911-10000.pcap
diff --git a/input/nbns_SMIA20111010-one_deduped-100.pcap b/input/deduped-orig/nbns_SMIA20111010-one_deduped-100.pcap
similarity index 100%
rename from input/nbns_SMIA20111010-one_deduped-100.pcap
rename to input/deduped-orig/nbns_SMIA20111010-one_deduped-100.pcap
diff --git a/input/nbns_SMIA20111010-one_deduped-1000.pcap b/input/deduped-orig/nbns_SMIA20111010-one_deduped-1000.pcap
similarity index 100%
rename from input/nbns_SMIA20111010-one_deduped-1000.pcap
rename to input/deduped-orig/nbns_SMIA20111010-one_deduped-1000.pcap
diff --git a/input/nbns_SMIA20111010-one_deduped-10000.pcap b/input/deduped-orig/nbns_SMIA20111010-one_deduped-10000.pcap
similarity index 100%
rename from input/nbns_SMIA20111010-one_deduped-10000.pcap
rename to input/deduped-orig/nbns_SMIA20111010-one_deduped-10000.pcap
diff --git a/input/ntp_SMIA-20111010_deduped-100.pcap b/input/deduped-orig/ntp_SMIA-20111010_deduped-100.pcap
similarity index 100%
rename from input/ntp_SMIA-20111010_deduped-100.pcap
rename to input/deduped-orig/ntp_SMIA-20111010_deduped-100.pcap
diff --git a/input/ntp_SMIA-20111010_deduped-1000.pcap b/input/deduped-orig/ntp_SMIA-20111010_deduped-1000.pcap
similarity index 100%
rename from input/ntp_SMIA-20111010_deduped-1000.pcap
rename to input/deduped-orig/ntp_SMIA-20111010_deduped-1000.pcap
diff --git a/input/ntp_SMIA-20111010_deduped-9995-10000.pcap b/input/deduped-orig/ntp_SMIA-20111010_deduped-9995-10000.pcap
similarity index 100%
rename from input/ntp_SMIA-20111010_deduped-9995-10000.pcap
rename to input/deduped-orig/ntp_SMIA-20111010_deduped-9995-10000.pcap
diff --git a/input/smb_SMIA20111010-one_deduped-100.pcap b/input/deduped-orig/smb_SMIA20111010-one_deduped-100.pcap
similarity index 100%
rename from input/smb_SMIA20111010-one_deduped-100.pcap
rename to input/deduped-orig/smb_SMIA20111010-one_deduped-100.pcap
diff --git a/input/smb_SMIA20111010-one_deduped-1000.pcap b/input/deduped-orig/smb_SMIA20111010-one_deduped-1000.pcap
similarity index 100%
rename from input/smb_SMIA20111010-one_deduped-1000.pcap
rename to input/deduped-orig/smb_SMIA20111010-one_deduped-1000.pcap
diff --git a/input/smb_SMIA20111010-one_deduped-10000.pcap b/input/deduped-orig/smb_SMIA20111010-one_deduped-10000.pcap
similarity index 100%
rename from input/smb_SMIA20111010-one_deduped-10000.pcap
rename to input/deduped-orig/smb_SMIA20111010-one_deduped-10000.pcap
diff --git a/input/maxdiff-filtered/dhcp_SMIA2011101X_deduped-10000_maxdiff-100.pcap b/input/hide/maxdiff-filtered/dhcp_SMIA2011101X_deduped-10000_maxdiff-100.pcap
similarity index 100%
rename from input/maxdiff-filtered/dhcp_SMIA2011101X_deduped-10000_maxdiff-100.pcap
rename to input/hide/maxdiff-filtered/dhcp_SMIA2011101X_deduped-10000_maxdiff-100.pcap
diff --git a/input/maxdiff-filtered/dhcp_SMIA2011101X_deduped-10000_maxdiff-1000.pcap b/input/hide/maxdiff-filtered/dhcp_SMIA2011101X_deduped-10000_maxdiff-1000.pcap
similarity index 100%
rename from input/maxdiff-filtered/dhcp_SMIA2011101X_deduped-10000_maxdiff-1000.pcap
rename to input/hide/maxdiff-filtered/dhcp_SMIA2011101X_deduped-10000_maxdiff-1000.pcap
diff --git a/input/maxdiff-filtered/dhcp_SMIA2011101X_deduped-10000_maxdiff-1100.pcap b/input/hide/maxdiff-filtered/dhcp_SMIA2011101X_deduped-10000_maxdiff-1100.pcap
similarity index 100%
rename from input/maxdiff-filtered/dhcp_SMIA2011101X_deduped-10000_maxdiff-1100.pcap
rename to input/hide/maxdiff-filtered/dhcp_SMIA2011101X_deduped-10000_maxdiff-1100.pcap
diff --git a/input/maxdiff-filtered/dns_ictf2010-new-deduped-10000_maxdiff-100.pcap b/input/hide/maxdiff-filtered/dns_ictf2010-new-deduped-10000_maxdiff-100.pcap
similarity index 100%
rename from input/maxdiff-filtered/dns_ictf2010-new-deduped-10000_maxdiff-100.pcap
rename to input/hide/maxdiff-filtered/dns_ictf2010-new-deduped-10000_maxdiff-100.pcap
diff --git a/input/maxdiff-filtered/dns_ictf2010-new-deduped-10000_maxdiff-1000.pcap b/input/hide/maxdiff-filtered/dns_ictf2010-new-deduped-10000_maxdiff-1000.pcap
similarity index 100%
rename from input/maxdiff-filtered/dns_ictf2010-new-deduped-10000_maxdiff-1000.pcap
rename to input/hide/maxdiff-filtered/dns_ictf2010-new-deduped-10000_maxdiff-1000.pcap
diff --git a/input/maxdiff-filtered/dns_ictf2010-new-deduped-10000_maxdiff-1100.pcap b/input/hide/maxdiff-filtered/dns_ictf2010-new-deduped-10000_maxdiff-1100.pcap
similarity index 100%
rename from input/maxdiff-filtered/dns_ictf2010-new-deduped-10000_maxdiff-1100.pcap
rename to input/hide/maxdiff-filtered/dns_ictf2010-new-deduped-10000_maxdiff-1100.pcap
diff --git a/input/maxdiff-filtered/dns_ictf2010_deduped-9911_maxdiff-100.pcap b/input/hide/maxdiff-filtered/dns_ictf2010_deduped-9911_maxdiff-100.pcap
similarity index 100%
rename from input/maxdiff-filtered/dns_ictf2010_deduped-9911_maxdiff-100.pcap
rename to input/hide/maxdiff-filtered/dns_ictf2010_deduped-9911_maxdiff-100.pcap
diff --git a/input/maxdiff-filtered/dns_ictf2010_deduped-9911_maxdiff-1000.pcap b/input/hide/maxdiff-filtered/dns_ictf2010_deduped-9911_maxdiff-1000.pcap
similarity index 100%
rename from input/maxdiff-filtered/dns_ictf2010_deduped-9911_maxdiff-1000.pcap
rename to input/hide/maxdiff-filtered/dns_ictf2010_deduped-9911_maxdiff-1000.pcap
diff --git a/input/maxdiff-filtered/dns_ictf2010_deduped-9911_maxdiff-1100.pcap b/input/hide/maxdiff-filtered/dns_ictf2010_deduped-9911_maxdiff-1100.pcap
similarity index 100%
rename from input/maxdiff-filtered/dns_ictf2010_deduped-9911_maxdiff-1100.pcap
rename to input/hide/maxdiff-filtered/dns_ictf2010_deduped-9911_maxdiff-1100.pcap
diff --git a/input/maxdiff-filtered/nbns_SMIA20111010-one_deduped-10000_maxdiff-100.pcap b/input/hide/maxdiff-filtered/nbns_SMIA20111010-one_deduped-10000_maxdiff-100.pcap
similarity index 100%
rename from input/maxdiff-filtered/nbns_SMIA20111010-one_deduped-10000_maxdiff-100.pcap
rename to input/hide/maxdiff-filtered/nbns_SMIA20111010-one_deduped-10000_maxdiff-100.pcap
diff --git a/input/maxdiff-filtered/nbns_SMIA20111010-one_deduped-10000_maxdiff-1000.pcap b/input/hide/maxdiff-filtered/nbns_SMIA20111010-one_deduped-10000_maxdiff-1000.pcap
similarity index 100%
rename from input/maxdiff-filtered/nbns_SMIA20111010-one_deduped-10000_maxdiff-1000.pcap
rename to input/hide/maxdiff-filtered/nbns_SMIA20111010-one_deduped-10000_maxdiff-1000.pcap
diff --git a/input/maxdiff-filtered/nbns_SMIA20111010-one_deduped-10000_maxdiff-1100.pcap b/input/hide/maxdiff-filtered/nbns_SMIA20111010-one_deduped-10000_maxdiff-1100.pcap
similarity index 100%
rename from input/maxdiff-filtered/nbns_SMIA20111010-one_deduped-10000_maxdiff-1100.pcap
rename to input/hide/maxdiff-filtered/nbns_SMIA20111010-one_deduped-10000_maxdiff-1100.pcap
diff --git a/input/maxdiff-filtered/ntp_SMIA-20111010_deduped-9995_maxdiff-100.pcap b/input/hide/maxdiff-filtered/ntp_SMIA-20111010_deduped-9995_maxdiff-100.pcap
similarity index 100%
rename from input/maxdiff-filtered/ntp_SMIA-20111010_deduped-9995_maxdiff-100.pcap
rename to input/hide/maxdiff-filtered/ntp_SMIA-20111010_deduped-9995_maxdiff-100.pcap
diff --git a/input/maxdiff-filtered/ntp_SMIA-20111010_deduped-9995_maxdiff-1000.pcap b/input/hide/maxdiff-filtered/ntp_SMIA-20111010_deduped-9995_maxdiff-1000.pcap
similarity index 100%
rename from input/maxdiff-filtered/ntp_SMIA-20111010_deduped-9995_maxdiff-1000.pcap
rename to input/hide/maxdiff-filtered/ntp_SMIA-20111010_deduped-9995_maxdiff-1000.pcap
diff --git a/input/maxdiff-filtered/ntp_SMIA-20111010_deduped-9995_maxdiff-1100.pcap b/input/hide/maxdiff-filtered/ntp_SMIA-20111010_deduped-9995_maxdiff-1100.pcap
similarity index 100%
rename from input/maxdiff-filtered/ntp_SMIA-20111010_deduped-9995_maxdiff-1100.pcap
rename to input/hide/maxdiff-filtered/ntp_SMIA-20111010_deduped-9995_maxdiff-1100.pcap
diff --git a/input/maxdiff-filtered/smb_SMIA20111010-one_deduped-10000_maxdiff-100.pcap b/input/hide/maxdiff-filtered/smb_SMIA20111010-one_deduped-10000_maxdiff-100.pcap
similarity index 100%
rename from input/maxdiff-filtered/smb_SMIA20111010-one_deduped-10000_maxdiff-100.pcap
rename to input/hide/maxdiff-filtered/smb_SMIA20111010-one_deduped-10000_maxdiff-100.pcap
diff --git a/input/maxdiff-filtered/smb_SMIA20111010-one_deduped-10000_maxdiff-1000.pcap b/input/hide/maxdiff-filtered/smb_SMIA20111010-one_deduped-10000_maxdiff-1000.pcap
similarity index 100%
rename from input/maxdiff-filtered/smb_SMIA20111010-one_deduped-10000_maxdiff-1000.pcap
rename to input/hide/maxdiff-filtered/smb_SMIA20111010-one_deduped-10000_maxdiff-1000.pcap
diff --git a/input/maxdiff-filtered/smb_SMIA20111010-one_deduped-10000_maxdiff-1100.pcap b/input/hide/maxdiff-filtered/smb_SMIA20111010-one_deduped-10000_maxdiff-1100.pcap
similarity index 100%
rename from input/maxdiff-filtered/smb_SMIA20111010-one_deduped-10000_maxdiff-1100.pcap
rename to input/hide/maxdiff-filtered/smb_SMIA20111010-one_deduped-10000_maxdiff-1100.pcap
diff --git a/input/mindiff-filtered/dhcp_SMIA2011101X_deduped-10000_mindiff-1100.pcap b/input/hide/mindiff-filtered/dhcp_SMIA2011101X_deduped-10000_mindiff-1100.pcap
similarity index 100%
rename from input/mindiff-filtered/dhcp_SMIA2011101X_deduped-10000_mindiff-1100.pcap
rename to input/hide/mindiff-filtered/dhcp_SMIA2011101X_deduped-10000_mindiff-1100.pcap
diff --git a/input/mindiff-filtered/dns_ictf2010-new-deduped-10000_mindiff-1100.pcap b/input/hide/mindiff-filtered/dns_ictf2010-new-deduped-10000_mindiff-1100.pcap
similarity index 100%
rename from input/mindiff-filtered/dns_ictf2010-new-deduped-10000_mindiff-1100.pcap
rename to input/hide/mindiff-filtered/dns_ictf2010-new-deduped-10000_mindiff-1100.pcap
diff --git a/input/mindiff-filtered/dns_ictf2010_deduped-9911-10000_mindiff-1100.pcap b/input/hide/mindiff-filtered/dns_ictf2010_deduped-9911-10000_mindiff-1100.pcap
similarity index 100%
rename from input/mindiff-filtered/dns_ictf2010_deduped-9911-10000_mindiff-1100.pcap
rename to input/hide/mindiff-filtered/dns_ictf2010_deduped-9911-10000_mindiff-1100.pcap
diff --git a/input/mindiff-filtered/nbns_SMIA20111010-one_deduped-10000_mindiff-1100.pcap b/input/hide/mindiff-filtered/nbns_SMIA20111010-one_deduped-10000_mindiff-1100.pcap
similarity index 100%
rename from input/mindiff-filtered/nbns_SMIA20111010-one_deduped-10000_mindiff-1100.pcap
rename to input/hide/mindiff-filtered/nbns_SMIA20111010-one_deduped-10000_mindiff-1100.pcap
diff --git a/input/mindiff-filtered/ntp_SMIA-20111010_deduped-9995-10000_mindiff-1100.pcap b/input/hide/mindiff-filtered/ntp_SMIA-20111010_deduped-9995-10000_mindiff-1100.pcap
similarity index 100%
rename from input/mindiff-filtered/ntp_SMIA-20111010_deduped-9995-10000_mindiff-1100.pcap
rename to input/hide/mindiff-filtered/ntp_SMIA-20111010_deduped-9995-10000_mindiff-1100.pcap
diff --git a/input/mindiff-filtered/smb_SMIA20111010-one_deduped-10000_mindiff-1100.pcap b/input/hide/mindiff-filtered/smb_SMIA20111010-one_deduped-10000_mindiff-1100.pcap
similarity index 100%
rename from input/mindiff-filtered/smb_SMIA20111010-one_deduped-10000_mindiff-1100.pcap
rename to input/hide/mindiff-filtered/smb_SMIA20111010-one_deduped-10000_mindiff-1100.pcap
diff --git a/requirements.txt b/requirements.txt
index 43d44d1d..7a3e7ad0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,9 @@ matplotlib
pandas
scipy
tabulate
+networkx
+gitpython
+colorhash
+
+pcapy
+xlwt
\ No newline at end of file
diff --git a/src/Contents.md b/src/Contents.md
new file mode 100644
index 00000000..27ccf1f5
--- /dev/null
+++ b/src/Contents.md
@@ -0,0 +1,38 @@
+# Script Content Overview
+
+
+
+
+## Prep/Utils
+
+### check_parse-pcap.py
+### check_pcap-info.py
+
+### prep_deduplicate-trace.py
+
+
+
+
+## Segmentation: NEMESYS, Zeros
+
+### netzob_fms.py
+baseline
+
+### Original
+The original NEMESYS as published at WOOT 2018.
+#### nemesys_field-deviation-plot.py
+#### nemesys_fms.py
+#### nemesys.py
+
+
+
+## NEMETYL
+
+### netzob_messagetypes.py
+baseline
+
+### nemetyl_align-segments.py
+
+
+
+
diff --git a/src/check_parse-pcap.py b/src/check_parse-pcap.py
index c1110437..dde2d91d 100644
--- a/src/check_parse-pcap.py
+++ b/src/check_parse-pcap.py
@@ -27,6 +27,7 @@
specimens = SpecimenLoader(args.pcapfilename, args.targetlayer, args.relativeToIP)
else:
specimens = SpecimenLoader(args.pcapfilename)
+ print('Loaded PCAP file:', specimens.pcapFileName)
pkt = list(specimens.messagePool.values())
st = time.time()
diff --git a/src/check_pcap-info.py b/src/check_pcap-info.py
index 6e4ec2a3..0ab3e159 100644
--- a/src/check_pcap-info.py
+++ b/src/check_pcap-info.py
@@ -1,7 +1,10 @@
"""
Parse a PCAP, print some statistics and info about it, and open an IPython shell.
"""
-import IPython
+from itertools import chain
+from typing import List, Sequence
+
+import IPython, numpy
from argparse import ArgumentParser
from os.path import isfile, basename
from tabulate import tabulate
@@ -17,6 +20,9 @@ def countByteFrequency():
return bytefreq.most_common()
+def meanByteDiff(messages: Sequence) -> List[List[float]]:
+ return [[numpy.diff(list(msg.data)).mean()] for msg in messages]
+
if __name__ == '__main__':
parser = ArgumentParser(
@@ -42,8 +48,10 @@ def countByteFrequency():
print("Most frequent byte values:")
print(tabulate(
((hex(b), o) for b, o in countByteFrequency()[:10])
- , headers=["byte value", "occurences"]))
-
+ , headers=["byte value", "occurrences"]))
+ print("Mean difference between bytes per message:",
+ numpy.mean(list(chain.from_iterable(meanByteDiff(specimens.messagePool.keys())))))
+ # print(tabulate(meanByteDiff(specimens.messagePool.keys())))
if args.interactive:
print('Loaded PCAP in: specimens')
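
The added `meanByteDiff` computes, per message, the mean of the differences between consecutive byte values, and the script then averages these means over the whole trace. A self-contained sketch of the same computation on plain byte strings (the repository version receives netzob messages and reads `msg.data`):

```python
from itertools import chain
from typing import List, Sequence
import numpy

def meanByteDiff(messages: Sequence[bytes]) -> List[List[float]]:
    # one single-element row per message, matching the tabulate-friendly shape above
    return [[numpy.diff(list(msg)).mean()] for msg in messages]

msgs = [bytes([1, 2, 4, 8]), bytes([10, 10, 10])]
perMessage = meanByteDiff(msgs)
print(perMessage)  # [[2.333...], [0.0]] -- from diffs (1, 2, 4) and (0, 0)
print(numpy.mean(list(chain.from_iterable(perMessage))))  # trace-wide mean: 1.1666...
```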
diff --git a/src/nemere/__init__.py b/src/nemere/__init__.py
index e69de29b..5c3ea3a0 100644
--- a/src/nemere/__init__.py
+++ b/src/nemere/__init__.py
@@ -0,0 +1,3 @@
+from nemere.utils.loader import SpecimenLoader
+from nemere.inference.segments import MessageSegment, TypedSegment
+from nemere.visualization.simplePrint import *
diff --git a/src/nemere/alignment/alignMessages.py b/src/nemere/alignment/alignMessages.py
index 7ac7722e..2d576629 100644
--- a/src/nemere/alignment/alignMessages.py
+++ b/src/nemere/alignment/alignMessages.py
@@ -1,17 +1,22 @@
import itertools
-from typing import Tuple, Dict, List, Sequence, Union
+from typing import Tuple, Dict, List, Sequence, Union, OrderedDict
+import time
import numpy
from hdbscan import HDBSCAN
from sklearn.cluster import DBSCAN
from scipy.special import comb
+from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage
+from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage
+
from nemere.alignment.hirschbergAlignSegments import HirschbergOnSegmentSimilarity
from nemere.inference.segmentHandler import matrixFromTpairs
from nemere.inference.segments import MessageSegment
from nemere.inference.templates import DistanceCalculator, MemmapDC
+
class SegmentedMessages(object):
def __init__(self, dc: DistanceCalculator, segmentedMessages: Sequence[Tuple[MessageSegment]]):
self._score_match = None
@@ -19,6 +24,11 @@ def __init__(self, dc: DistanceCalculator, segmentedMessages: Sequence[Tuple[Mes
self._score_gap = None
self._dc = dc
+ # For performance reasons, adjust the value domain of the segment similarity matrix to the score domain here
+        # for use in the methods (i.e., _nwScores)
+ self._segmentSimilaritiesScoreDomain = HirschbergOnSegmentSimilarity.scoreDomainSimilarityMatrix(
+ # convert dc.distanceMatrix from being a distance to a similarity measure
+ self._dc.similarityMatrix())
self._segmentedMessages = segmentedMessages # type: Sequence[Tuple[MessageSegment]]
self._similarities = self._calcSimilarityMatrix()
self._distances = self._calcDistanceMatrix(self._similarities)
@@ -44,13 +54,13 @@ def _nwScores(self):
combcount = int(comb(len(self._segmentedMessages), 2))
combcstep = combcount/100
- # convert dc.distanceMatrix from being a distance to a similarity measure
- segmentSimilarities = self._dc.similarityMatrix()
-
print("Calculate message alignment scores for {} messages and {} pairs".format(
len(self._segmentedMessages), combcount), end=' ')
import time; timeBegin = time.time()
- hirsch = HirschbergOnSegmentSimilarity(segmentSimilarities)
+ # If gaps should be adjusted from the default in the class, this needs to be done in __init__,
+ # when calling HirschbergOnSegmentSimilarity.scoreDomainSimilarityMatrix, as long as the matrix is precomputed
+ # there
+ hirsch = HirschbergOnSegmentSimilarity(self._segmentSimilaritiesScoreDomain, similaritiesScoreDomain=True)
self._score_gap = hirsch.score_gap
self._score_match = hirsch.score_match
self._score_mismatch = hirsch.score_mismatch
@@ -60,6 +70,8 @@ def _nwScores(self):
segseq1 = self._dc.segments2index(msg1)
# Needleman-Wunsch alignment score of the two messages: based on the last entry.
+ # TODO this could be parallelized to improve performance, but the contained read from the similarity matrix
+            # must not lead to copying of the matrix for each process! see https://research.wmz.ninja/articles/2018/03/on-sharing-large-arrays-when-using-pythons-multiprocessing.html
nwscores.append((msg0, msg1, hirsch.nwScore(segseq0, segseq1)[-1]))
if c % combcstep == 0:
print(" .", end="", flush=True)
@@ -225,7 +237,9 @@ def alignMessageType(self, msgcluster: List[Tuple[MessageSegment]]):
finished in 0.00 seconds.
Calculate message similarity from alignment scores...
>>> indicesalignment, alignedsegments = sm.alignMessageType(msgcluster)
+ >>> # noinspection PyUnresolvedReferences
>>> hexclualn = [[dc.segments[s].bytes.hex() if s != -1 else None for s in m] for m in indicesalignment]
+ >>> # noinspection PyUnresolvedReferences
>>> hexalnseg = [[s.bytes.hex() if s is not None else None for s in m] for m in alignedsegments]
>>> hexalnseg == hexclualn
True
@@ -302,6 +316,160 @@ def autoconfigureDBSCAN(self):
+class TypeIdentificationByAlignment(object):
+ """
+    Message Type Identification as described in NEMETYL, the INFOCOM 2020 paper
+    NEMETYL: NEtwork MEssage TYpe identification by aLignment.
+
+    Similar fields are aligned to determine a score that is used as the affinity value (dissimilarity)
+    of messages for clustering. The clusters are refined by splitting and merging based on heuristics.
+ """
+
+ def __init__(self, dc: DistanceCalculator, segmentedMessages: Sequence[Tuple[MessageSegment]],
+ tokenizer: str, messagePool: OrderedDict[AbstractMessage, RawMessage]):
+ """
+ Initialize the TYL instance to hold all results in its attributes.
+ clusterAlignSplitMerge() must be called to perform the inference.
+
+ :param dc:
+ :param segmentedMessages:
+ :param tokenizer:
+ :param messagePool:
+ """
+ self._dc, self._segmentedMessages = dc, segmentedMessages
+ self._tokenizer, self._messagePool = tokenizer, messagePool
+ self.sm = None # type: Union[None, SegmentedMessages]
+ self.eps = None # type: Union[None, float]
+ self.messageTupClusters = None # type: Union[None, Dict[int, List[Tuple[MessageSegment]]]]
+ self.messageObjClusters = None # type: Union[None, Dict[int, List[RawMessage]]]
+ self.labels = None # type: Union[None, numpy.ndarray]
+ self.clusterer = None # type: Union[None, DBSCAN]
+ self.alignedClusters = None # type: Union[None, Dict[int, List[Tuple[MessageSegment]]]]
+
+
+ self.dist_calc_messagesTime = None # type: Union[None, float]
+ self.cluster_params_autoconfTime = None # type: Union[None, float]
+ self.cluster_messagesTime = None # type: Union[None, float]
+ self.align_messagesTime = None # type: Union[None, float]
+
+ @property
+ def _isNemesys(self):
+ return self._tokenizer[:7] == "nemesys"
+
+ @property
+ def _isTshark(self):
+ return self._tokenizer == "tshark"
+
+ def clusterMessages(self):
+ """
+ Calculate Alignment-Score and CLUSTER messages
+ """
+ print("Calculate distance for {} messages...".format(len(self._segmentedMessages)))
+
+ self.dist_calc_messagesTime = time.time()
+ self.sm = SegmentedMessages(self._dc, self._segmentedMessages)
+ self.dist_calc_messagesTime = time.time() - self.dist_calc_messagesTime
+
+ print('Clustering messages...')
+ self.cluster_params_autoconfTime = time.time()
+ self.eps, min_samples = self.sm.autoconfigureDBSCAN()
+ self.cluster_params_autoconfTime = time.time() - self.cluster_params_autoconfTime
+ if self._isNemesys:
+ self.eps *= .8
+ self.cluster_messagesTime = time.time()
+ self.messageTupClusters, self.labels, self.clusterer = \
+ self.sm.clusterMessageTypesDBSCAN(eps=self.eps, min_samples=3)
+ # messageClusters, labels, clusterer = sm.clusterMessageTypesHDBSCAN()
+ self.cluster_messagesTime = time.time() - self.cluster_messagesTime
+
+ # clusters as label to message object mapping
+ self.messageObjClusters = {lab : [self._messagePool[element[0].message] for element in segseq]
+ for lab, segseq in self.messageTupClusters.items()}
+ # # # # # # # # # # # # # # # # # # # # # # # #
+
+ def alignClusterMembers(self):
+ """
+ ALIGN cluster members
+ """
+ assert isinstance(self.sm, SegmentedMessages) and isinstance(self.messageTupClusters, dict), \
+ "clusterMessages() must have run before alignClusterMembers()"
+
+ self.align_messagesTime = time.time()
+ self.alignedClusters = dict()
+ # alignedClustersHex = dict()
+ print("Align each cluster...")
+ for clunu, msgcluster in self.messageTupClusters.items(): # type: int, List[Tuple[MessageSegment]]
+ # TODO perform this in parallel (per future)
+ clusteralignment, alignedsegments = self.sm.alignMessageType(msgcluster)
+ self.alignedClusters[clunu] = alignedsegments
+ # alignedClustersHex[clunu] = [[s.bytes.hex() if s is not None else None for s in m] for m in alignedsegments]
+ print()
+ self.align_messagesTime = time.time() - self.align_messagesTime
+ # # # # # # # # # # # # # # # # # # # # # # # #
+
+ def splitClusters(self, **kwargs):
+ """
+ SPLIT clusters based on fields without rare values
+
+        :param kwargs: if kwargs are set, they are used to activate CSV output at RelaxedExoticClusterSplitter,
+            see nemere.alignment.clusterSplitting.ClusterSplitter.activateCVSout()
+ """
+ assert isinstance(self.alignedClusters, dict) and isinstance(self.messageTupClusters, dict) \
+ and isinstance(self.sm, SegmentedMessages), "alignClusterMembers() must have run before splitClusters()"
+
+ from nemere.alignment.clusterSplitting import RelaxedExoticClusterSplitter
+ cSplitter = RelaxedExoticClusterSplitter(6 if not self._isTshark else 3,
+ self.alignedClusters, self.messageTupClusters, self.sm)
+ if kwargs:
+ cSplitter.activateCVSout(**kwargs)
+ # IN-PLACE split of clusters in alignedClusters and messageClusters
+ cSplitter.split()
+ # update dependent vars
+ self.labels = cSplitter.labels
+ self.messageObjClusters = {lab: [self._messagePool[element[0].message] for element in segseq]
+ for lab, segseq in self.messageTupClusters.items()}
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ return "split"
+
+ def mergeClusters(self):
+ """
+ Check for cluster MERGE candidates
+ """
+ assert isinstance(self.alignedClusters, dict) and isinstance(self.messageTupClusters, dict) \
+ and isinstance(self._dc, DistanceCalculator), "splitClusters() must have run before mergeClusters()"
+
+ from nemere.alignment.clusterMerging import ClusterMerger
+ print("Check for cluster merge candidates...")
+ # ClusterMerger
+ clustermerger = ClusterMerger(self.alignedClusters, self._dc, self.messageTupClusters)
+ self.messageTupClusters = clustermerger.merge(self._isNemesys)
+ self.messageObjClusters = {lab: [self._messagePool[element[0].message] for element in segseq]
+ for lab, segseq in self.messageTupClusters.items()}
+ map2label = {msg: lab for lab, msglist in self.messageObjClusters.items() for msg in msglist}
+ self.labels = numpy.array([map2label[self._messagePool[msg[0].message]] for msg in self._segmentedMessages])
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ return "split+merged"
+
+ def clusterAlignSplitMerge(self):
+ """
+ This is the main function that implements all steps of the NEMETYL method.
+        Find all results in this object instance's attributes.
+        If intermediate results are required, run clusterMessages(), alignClusterMembers(), splitClusters(),
+        and mergeClusters() yourself and capture self.messageTupClusters, self.messageObjClusters, and
+        self.labels, which are updated after each step.
+ """
+ # Calculate Alignment-Score and CLUSTER messages
+ self.clusterMessages()
+ # ALIGN cluster members
+ self.alignClusterMembers()
+ # SPLIT clusters based on fields without rare values
+ self.splitClusters()
+ # Check for cluster MERGE candidates
+ self.mergeClusters()
+ # TODO split clusters are internally re-aligned, but NOT merged clusters. Can this lead to an inconsistency?
+
+
+
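The new `TypeIdentificationByAlignment` bundles the four NEMETYL stages behind `clusterAlignSplitMerge()`. A hedged usage sketch, assuming `dc`, `segmentedMessages`, and `messagePool` have already been prepared as in `nemetyl_align-segments.py` (these variables are placeholders here):

```python
from nemere.alignment.alignMessages import TypeIdentificationByAlignment

# dc: DistanceCalculator, segmentedMessages: per-message segment tuples,
# messagePool: mapping of (filtered) messages to raw messages -- all assumed given
tyl = TypeIdentificationByAlignment(dc, segmentedMessages, "nemesys", messagePool)

# runs clusterMessages(), alignClusterMembers(), splitClusters(), and mergeClusters()
tyl.clusterAlignSplitMerge()

# results are left in the instance attributes
for label, messages in tyl.messageObjClusters.items():
    print(label, len(messages))
```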
diff --git a/src/nemere/alignment/clusterMerging.py b/src/nemere/alignment/clusterMerging.py
index 90f8427f..a4a4d016 100644
--- a/src/nemere/alignment/clusterMerging.py
+++ b/src/nemere/alignment/clusterMerging.py
@@ -117,15 +117,16 @@ def printShouldMerge(connectedClusters: Iterable[Iterable[int]], clusterStats):
class ClusterMerger(ClusterAligner):
- def __init__(self, alignedClusters: Dict[int, List], dc: DistanceCalculator):
+ def __init__(self, alignedClusters: Dict[int, List], dc: DistanceCalculator,
+ messageTuplesClusters: Dict[int, List[Tuple[MessageSegment]]]):
self.mismatchGrace = .2
super().__init__(alignedClusters, dc)
-
+ self.messageTuplesClusters = messageTuplesClusters
# # # # # # # # # # # # # # # # # # #
# An experimentation protocol for design decisions exists!
# # # # # # # # # # # # # # # # # # #
- def alignFieldClasses(self, mmg=(0, -1, 5)):
+ def _alignFieldClasses(self, mmg=(0, -1, 5)):
fclassHirsch, statDynFields, statDynValues = self.generateHirsch(mmg)
statDynValuesMap = {sdv: idx for idx, sdv in enumerate(statDynValues)}
@@ -142,8 +143,7 @@ def alignFieldClasses(self, mmg=(0, -1, 5)):
# IPython.embed()
return alignedFieldClasses
-
- def gapMerging4nemesys(self, alignedFieldClasses):
+ def _gapMerging4nemesys(self, alignedFieldClasses):
from tabulate import tabulate
import IPython
@@ -195,8 +195,8 @@ def gapMerging4nemesys(self, alignedFieldClasses):
cid = 0
while cid < len(changes):
rid, (mid, sab) = changes[cid]
- mergedValues = tuple(newFC[sab][mid].values + newFC[sab][rid].values if rid > mid \
- else newFC[sab][rid].values + newFC[sab][mid].values)
+ mergedValues = tuple(newFC[sab][mid].values) + tuple(newFC[sab][rid].values) if rid > mid \
+ else tuple(newFC[sab][rid].values) + tuple(newFC[sab][mid].values)
mergedSTA = None
for seg in self.dc.segments:
if tuple(seg.values) == mergedValues:
@@ -240,12 +240,10 @@ def gapMerging4nemesys(self, alignedFieldClasses):
alignedFieldClassesRefined[cluPair] = alignedFieldClasses[cluPair]
return alignedFieldClassesRefined
+ def _generateMatchingConditions(self, alignedFieldClasses):
+ return self._generateMatchingConditionsAlt1(alignedFieldClasses)
- def generateMatchingConditions(self, alignedFieldClasses):
- return self.generateMatchingConditionsAlt1(alignedFieldClasses)
-
-
- def generateMatchingConditionsAlt1(self, alignedFieldClasses):
+ def _generateMatchingConditionsAlt1(self, alignedFieldClasses):
"""
fixed threshold for DYN-STA mix: 0.7 > afcA.distToNearest(afcB)
@@ -278,8 +276,7 @@ def generateMatchingConditionsAlt1(self, alignedFieldClasses):
for afcA, afcB in zip(*alignedFieldClasses[(clunuA, clunuB)])
] for clunuA, clunuB in alignedFieldClasses.keys()}
-
- def generateMatchingConditionsAlt2(self, alignedFieldClasses):
+ def _generateMatchingConditionsAlt2(self, alignedFieldClasses):
"""
alternative dynamic threshold for DYN-STA mix: dist(STA, DYN.medoid) <= DYN.maxDistToMedoid()
@@ -317,9 +314,8 @@ def generateMatchingConditionsAlt2(self, alignedFieldClasses):
for afcA, afcB in zip(*alignedFieldClasses[(clunuA, clunuB)])
] for clunuA, clunuB in alignedFieldClasses.keys()}
-
@staticmethod
- def selectMatchingClusters(alignedFieldClasses, matchingConditions):
+ def _selectMatchingClusters(alignedFieldClasses, matchingConditions):
def lenAndTrue(boolist, length=2, truths=0):
return len(boolist) <= length and len([a for a in boolist if a]) > truths
@@ -394,9 +390,8 @@ def onlyMSdistMatch(clunuA, clunuB):
# or onlyMSdistMatch(clunuA, clunuB)
]
-
- def mergeClusters(self, messageClusters, clusterStats, alignedFieldClasses,
- matchingClusters, matchingConditions):
+ def _mergeClusters(self, messageClusters, clusterStats, alignedFieldClasses,
+ matchingClusters, matchingConditions):
import IPython
from tabulate import tabulate
from nemere.utils.evaluationHelpers import printClusterMergeConditions
@@ -420,9 +415,7 @@ def mergeClusters(self, messageClusters, clusterStats, alignedFieldClasses,
remDue2gapsInARow = list()
for clunuAB in matchingClusters:
for flip in (0, 1):
- globals().update(locals())
rowOfGaps = [a[flip] for a in matchingConditions[clunuAB][1:]]
- globals().update(locals())
startOfGroups = [i for i, g in enumerate(rowOfGaps) if g and i > 1 and not rowOfGaps[i - 1]]
endOfGroups = [i for i, g in enumerate(rowOfGaps) if
g and i < len(rowOfGaps) - 1 and not rowOfGaps[i + 1]]
@@ -430,7 +423,6 @@ def mergeClusters(self, messageClusters, clusterStats, alignedFieldClasses,
endOfGroups.append(startOfGroups[-1])
if len(endOfGroups) > 0 and endOfGroups[0] == 0:
startOfGroups = [0] + startOfGroups
- globals().update(locals())
# field index before and after all gap groups longer than 2
groupOfLonger = [(sog - 1, eog + 1) for sog, eog in zip(startOfGroups, endOfGroups) if sog < eog - 1]
for beforeGroup, afterGroup in groupOfLonger:
@@ -559,13 +551,14 @@ def mergeClusters(self, messageClusters, clusterStats, alignedFieldClasses,
connectedClusters = list(connected_components(chainedRemains))
# for statistics
- missedmerges = ClusterClusterer.printShouldMerge(connectedClusters, clusterStats)
- missedmergepairs = [k for k in remainingClusters if any(
- [k[0] in mc and k[1] in mc or
- k[0] in mc and k[1] in chain.from_iterable([cc for cc in connectedClusters if k[0] in cc]) or
- k[0] in chain.from_iterable([cc for cc in connectedClusters if k[1] in cc]) and k[1] in mc
- for mc in missedmerges]
- )]
+ if clusterStats is not None:
+ missedmerges = ClusterClusterer.printShouldMerge(connectedClusters, clusterStats)
+ missedmergepairs = [k for k in remainingClusters if any(
+ [k[0] in mc and k[1] in mc or
+ k[0] in mc and k[1] in chain.from_iterable([cc for cc in connectedClusters if k[0] in cc]) or
+ k[0] in chain.from_iterable([cc for cc in connectedClusters if k[1] in cc]) and k[1] in mc
+ for mc in missedmerges]
+ )]
singleClusters = {ck: ml for ck, ml in messageClusters.items() if not chainedRemains.has_node(ck)}
mergedClusters = {str(mergelist):
@@ -575,7 +568,15 @@ def mergeClusters(self, messageClusters, clusterStats, alignedFieldClasses,
return mergedClusters
-
+ def merge(self, forNemesys=False, clusterStats=None):
+ alignedFieldClasses = self._alignFieldClasses((0, -1, 5)) # TODO alt1
+ # alignedFieldClasses = clustermerger._alignFieldClasses((0, -5, 5)) # TODO alt2
+ if forNemesys:
+ alignedFieldClasses = self._gapMerging4nemesys(alignedFieldClasses)
+ matchingConditions = self._generateMatchingConditions(alignedFieldClasses)
+ matchingClusters = ClusterMerger._selectMatchingClusters(alignedFieldClasses, matchingConditions)
+ return self._mergeClusters(self.messageTuplesClusters, clusterStats, alignedFieldClasses,
+ matchingClusters, matchingConditions)
class ClusterClusterer(ClusterAligner):
@@ -585,8 +586,6 @@ def __init__(self, alignedClusters: Dict[int, List], dc: DistanceCalculator):
self.clusterOrder = [clunu for clunu in sorted(alignedClusters.keys()) if clunu != -1]
self.distances = self.calcClusterDistances()
-
-
def calcClusterDistances(self, mmg=(0, -1, 5)):
from nemere.inference.segmentHandler import matrixFromTpairs
@@ -624,14 +623,12 @@ def calcClusterDistances(self, mmg=(0, -1, 5)):
assert distanceMatrix.min() >= 0, "prevent negative values for highly mismatching messages"
return distanceMatrix
-
def neighbors(self):
neighbors = list()
for idx, dists in enumerate(self.distances): # type: int, numpy.ndarray
neighbors.append(sorted([(i, d) for i, d in enumerate(dists) if i != idx], key=lambda x: x[1]))
return neighbors
-
def autoconfigureDBSCAN(self):
"""
Auto configure the clustering parameters epsilon and minPts regarding the input data
@@ -649,7 +646,7 @@ def autoconfigureDBSCAN(self):
seconddiffMax = (0, 0, 0)
# can we omit k = 0 ?
# No - recall and even more so precision deteriorates for dns and dhcp (1000s)
- for k in range(0, ceil(log(len(neighbors) ** 2))): # first log(n^2) alt.: // 10 first 10% of k-neigbors
+ for k in range(0, ceil(log(len(neighbors) ** 2))): # first log(n^2) alt.: // 10 first 10% of k-neighbors
knearest[k] = sorted([nfori[k][1] for nfori in neighbors])
smoothknearest[k] = gaussian_filter1d(knearest[k], sigma)
# max of second difference (maximum positive curvature) as knee (this not actually the knee!)
@@ -667,7 +664,6 @@ def autoconfigureDBSCAN(self):
print("eps {:0.3f} autoconfigured from k {}".format(epsilon, k))
return epsilon, min_samples
-
def clusterMessageTypesDBSCAN(self, eps = 1.5, min_samples = 3) \
-> Tuple[Dict[int, List[int]], numpy.ndarray, DBSCAN]:
clusterer = DBSCAN(metric='precomputed', eps=eps,
@@ -677,7 +673,6 @@ def clusterMessageTypesDBSCAN(self, eps = 1.5, min_samples = 3) \
clusterClusters, labels = self._postprocessClustering(clusterer)
return clusterClusters, labels, clusterer
-
def _postprocessClustering(self, clusterer: Union[DBSCAN]) \
-> Tuple[Dict[int, List[int]], numpy.ndarray]:
clusterer.fit(self.distances)
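
One detail worth noting in `_gapMerging4nemesys` above: the change from `tuple(a.values + b.values)` to `tuple(a.values) + tuple(b.values)` is significant if `values` holds numpy arrays, since `+` on arrays adds elementwise while `+` on tuples concatenates. A standalone demonstration (not repository code):

```python
import numpy

a = numpy.array([1, 2])
b = numpy.array([3, 4])

# elementwise addition -- produces the sums, not a merged field
print((a + b).tolist())            # [4, 6]
# tuple/list concatenation, as in the fixed code
print(a.tolist() + b.tolist())     # [1, 2, 3, 4]
```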
diff --git a/src/nemere/alignment/clusterSplitting.py b/src/nemere/alignment/clusterSplitting.py
index 6a6be486..3c86308d 100644
--- a/src/nemere/alignment/clusterSplitting.py
+++ b/src/nemere/alignment/clusterSplitting.py
@@ -2,7 +2,7 @@
Module to split clusters based on fields with high frequency values.
"""
-from os.path import exists
+from os.path import exists, join
from typing import List, Union, Tuple, Dict, Hashable
import numpy, csv
from collections import Counter
@@ -10,7 +10,8 @@
from nemere.inference.segments import MessageSegment
from nemere.alignment.alignMessages import SegmentedMessages
-
+from nemere.utils.evaluationHelpers import reportFolder
+from nemere.utils.reportWriter import IndividualClusterReport
debug = True
@@ -23,9 +24,7 @@ class ClusterSplitter(object):
Class to split clusters based on fields without rare values.
It uses static constraints about what is determined to be a rare value (#__getPivotFieldIds()).
"""
-
- exoticValueStats = "reports/exotic-values-statistics.csv"
-
+ exoticValueStats = join(reportFolder, "exotic-values-statistics.csv")
def __init__(self, fieldLenThresh: int,
alignedClusters: Dict[Hashable, List[Tuple[MessageSegment]]],
@@ -54,7 +53,7 @@ def __init__(self, fieldLenThresh: int,
self.__clusterPrecisions = None
- def activateCVSout(self, runtitle: str, trace: str, clusterPrecisions: Dict[Hashable, float]):
+ def activateCVSout(self, runtitle: Union[str, Dict], trace: str, clusterPrecisions: Dict[Hashable, float]):
"""
Activate writing of exotic field statistics to CSV for evaluation.
@@ -89,6 +88,19 @@ def _writeCSVline(self, aNum: Hashable, clusterSize, exotic, valCounts4fields: D
cPrec = self.__clusterPrecisions[aNum]
print("Cluster should", "" if cPrec < 1 else "not", "be split. Precision is", cPrec)
+ headers = [
+ 'trace', 'cluster_label', 'precision', 'cluster_size', 'field',
+ 'num_vals', 'maxdiff_n', 'maxdiff_v', 'sum=n', 'mean=n',
+ 'stdev=n', 'median=n'
+ ]
+ if not isinstance(self.__runtitle, str):
+ infCols = IndividualClusterReport.inferenceColumns(self.__runtitle)
+ headers = list(infCols.keys()) + headers
+ infParams = list(infCols.values())
+ else:
+ headers = ['run_title'] + headers
+ infParams = [self.__runtitle]
+
for fidx in exotic:
scnt = sorted(valCounts4fields[fidx].values())
diffmax = (numpy.diff(scnt).argmax() + 1) if len(scnt) > 1 else "-"
@@ -96,13 +108,8 @@ def _writeCSVline(self, aNum: Hashable, clusterSize, exotic, valCounts4fields: D
with open(ClusterSplitter.exoticValueStats, 'a') as csvfile:
exoticcsv = csv.writer(csvfile) # type: csv.writer
if csvWriteHead:
- exoticcsv.writerow([
- 'run_title', 'trace', 'cluster_label', 'precision', 'cluster_size', 'field',
- 'num_vals',
- 'maxdiff_n', 'maxdiff_v', 'sum=n', 'mean=n',
- 'stdev=n', 'median=n'
- ])
- fieldParameters = [self.__runtitle, self.__trace,
+ exoticcsv.writerow(headers)
+ fieldParameters = [*infParams, self.__trace,
aNum, cPrec, clusterSize, fidx, len(scnt)]
if len(scnt) > 1:
exoticcsv.writerow([
diff --git a/src/nemere/alignment/hirschbergAlignSegments.py b/src/nemere/alignment/hirschbergAlignSegments.py
index b79273be..d8709df3 100644
--- a/src/nemere/alignment/hirschbergAlignSegments.py
+++ b/src/nemere/alignment/hirschbergAlignSegments.py
@@ -11,16 +11,23 @@ class Alignment(ABC):
SCORE_MATCH = 1 # use as factor, to multiply with the similarity matrix.
SCORE_MISMATCH = 0
- def __init__(self, similarityMatrix, score_gap=SCORE_GAP, score_mismatch=SCORE_MISMATCH, score_match=SCORE_MATCH):
+ def __init__(self, similarityMatrix, score_gap=SCORE_GAP, score_mismatch=SCORE_MISMATCH, score_match=SCORE_MATCH,
+ similaritiesScoreDomain=False):
"""
:param similarityMatrix: normalized similarity matrix (0..1) of segments
with 1 meaning identity and 0 maximum dissimilarity.
+        :param similaritiesScoreDomain: Set to True if similarityMatrix is already stretched to the range between
+            score_mismatch and score_match. For multiple alignments with the same similarityMatrix, this greatly
+            improves performance (runtime and memory).
"""
self.score_gap = score_gap
self.score_match = score_match
self.score_mismatch = score_mismatch
- self._similarities = similarityMatrix \
- * (self.score_match - self.score_mismatch) + self.score_mismatch
+ if similaritiesScoreDomain:
+ self._similarities = similarityMatrix
+ else:
+ self._similarities = type(self).scoreDomainSimilarityMatrix(similarityMatrix,
+ self.score_mismatch, self.score_match)
"""
matrix of similarities: higher values denote closer match
@@ -28,6 +35,10 @@ def __init__(self, similarityMatrix, score_gap=SCORE_GAP, score_mismatch=SCORE_M
"mismatch penalty" that reaches into negative values for bad matches needs to be determined
"""
+ @staticmethod
+ def scoreDomainSimilarityMatrix(similarityMatrix, score_mismatch=SCORE_MISMATCH, score_match=SCORE_MATCH):
+ return similarityMatrix * (score_match - score_mismatch) + score_mismatch
+
@abstractmethod
def align(self, message0: List[int], message1: List[int]):
raise NotImplementedError()
@@ -81,7 +92,7 @@ def align(self, message0: List[int], message1: List[int]):
messageA.append(x)
messageB.append(-1) # gap
elif len(message0) == 1 or len(message1) == 1:
- nwalign = NWonSegmentSimilarity(self._similarities)
+ nwalign = NWonSegmentSimilarity(self._similarities, similaritiesScoreDomain=True)
haligned = nwalign.align(message0, message1)
# print("NW:")
# print(tabulate(haligned))
@@ -148,6 +159,8 @@ def nwScore(self, tokensX: List[int], tokensY: List[int]) -> numpy.ndarray:
for x in range(1, len(tokensX)+1): # init array
score[1,0] = score[0,0] + self.score_gap
for y in range(1, len(tokensY)+1):
+ # TODO if we optimize this some time, we must not copy the self._similarities matrix for each process!
+ # see https://research.wmz.ninja/articles/2018/03/on-sharing-large-arrays-when-using-pythons-multiprocessing.html
scoreSub = score[0,y-1] + self._similarities[tokensX[x-1], tokensY[y-1]]
scoreDel = score[0,y] + self.score_gap
scoreIns = score[1,y-1] + self.score_gap
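
The factored-out `scoreDomainSimilarityMatrix` stretches a normalized similarity matrix (0..1) linearly into the score domain: `s' = s * (score_match - score_mismatch) + score_mismatch`. Precomputing it once pays off because every alignment reuses the same matrix. A quick numeric sketch with an illustrative mismatch penalty of -1 (the class defaults of match 1 / mismatch 0 leave the matrix unchanged):

```python
import numpy

def scoreDomainSimilarityMatrix(sim, score_mismatch=0, score_match=1):
    # linear stretch of normalized similarities into [score_mismatch, score_match]
    return sim * (score_match - score_mismatch) + score_mismatch

sim = numpy.array([[1.0, 0.25],
                   [0.25, 1.0]])
print(scoreDomainSimilarityMatrix(sim, score_mismatch=-1, score_match=1))
# [[ 1.  -0.5]
#  [-0.5  1. ]]  -- dissimilar pairs now carry a negative penalty
```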
diff --git a/src/nemere/inference/analyzers.py b/src/nemere/inference/analyzers.py
index 92ea9766..07ef7dd8 100644
--- a/src/nemere/inference/analyzers.py
+++ b/src/nemere/inference/analyzers.py
@@ -77,7 +77,7 @@ def bitCongruenceBetweenTokens(tokenlist: Union[List, bytes]):
"""
Bitwise congruence: Simple Matching [Sokal & Michener]
- not unit-dependant, token-dependent: always compares tokenwise
+ not unit-dependent, token-dependent: always compares tokenwise
:param tokenlist: list of tokens between which the bit congruence is calculated
:return: list of congruences from index i = 1 to n between bits of i-1 and i
@@ -245,7 +245,7 @@ def setAnalysisParams(self, sigma=1.5):
def analyze(self):
from collections import Sequence
if not self._analysisArgs or not isinstance(self._analysisArgs, Sequence):
- raise ParametersNotSet('Analysis parameter missing: horizon and sigma.')
+ raise ParametersNotSet('Analysis parameter missing: sigma.')
sigma, = self._analysisArgs
super().analyze()
self._bcdvalues = self._values
@@ -308,7 +308,6 @@ def messageSegmentation(self) -> List[MessageSegment]:
segments.append(MessageSegment(self, cutCurr, cutNext-cutCurr))
return segments
-
def extrema(self) -> List[Tuple[int, bool]]:
"""
:return: all extrema of the smoothed bcd, each described by a tuple of its index and bool (min is False)
@@ -1002,7 +1001,7 @@ class Autocorrelation(MessageAnalyzer):
"""
def __init__(self, message: AbstractMessage, unit=MessageAnalyzer.U_BYTE):
super().__init__(message, unit)
- self._am = None # type: MessageAnalyzer
+ self._am = None # type: Union[MessageAnalyzer, None]
@property
def domain(self):
diff --git a/src/nemere/inference/formatRefinement.py b/src/nemere/inference/formatRefinement.py
index 35a0ee3e..19391aca 100644
--- a/src/nemere/inference/formatRefinement.py
+++ b/src/nemere/inference/formatRefinement.py
@@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
from typing import List
-from inference.segments import MessageSegment
+from nemere.inference.segments import MessageSegment
def isPrintableChar(char: int):
@@ -24,21 +24,50 @@ def isPrintable(bstring: bytes) -> bool:
return False
return True
-
-def locateNonPrintable(bstring: bytes) -> List[int]:
+def isOverlapping(segA: MessageSegment, segB: MessageSegment) -> bool:
"""
- A bit broader definition of printable than python string's isPrintable()
-
- :param bstring: a string of bytes
- :return: position of bytes not in \t, \n, \r or between >= 0x20 and <= 0x7e
+ Determines whether the given segments overlap.
+
+ >>> from nemere.inference.formatRefinement import isOverlapping
+ >>> from nemere.inference.segments import MessageSegment
+ >>> from nemere.inference.analyzers import Value
+ >>> from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage
+ >>> from itertools import combinations
+ >>>
+ >>> dummymsg = RawMessage(bytes(list(range(20, 40))))
+ >>> dummyana = Value(dummymsg)
+ >>> nonoverlapping = [ MessageSegment(dummyana, 0, 2), MessageSegment(dummyana, 5, 3),
+ ... MessageSegment(dummyana, 8, 6), MessageSegment(dummyana, 17, 2) ]
+ >>> overlapping1 = [ MessageSegment(dummyana, 0, 2), MessageSegment(dummyana, 1, 3) ]
+ >>> overlapping2 = [ MessageSegment(dummyana, 7, 6), MessageSegment(dummyana, 5, 6) ]
+ >>> noncomb = combinations(nonoverlapping, 2)
+ >>> for nc in noncomb:
+ ... print(isOverlapping(*nc))
+ False
+ False
+ False
+ False
+ False
+ False
+ >>> print(isOverlapping(*overlapping1))
+ True
+ >>> print(isOverlapping(*overlapping2))
+ True
+ >>> print(isOverlapping(*reversed(overlapping1)))
+ True
+ >>> print(isOverlapping(*reversed(overlapping2)))
+ True
+
+ :param segA: The one segment to check.
+ :param segB: The other segment to check against.
+ :return: True if the segments overlap, False otherwise.
"""
- npr = list()
- for idx, bchar in enumerate(bstring):
- if isPrintableChar(bchar):
- continue
- else:
- npr.append(idx)
- return npr
+ return segA.message == segB.message \
+ and segA.offset < segB.nextOffset \
+ and segB.offset < segA.nextOffset
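The predicate is the standard half-open interval intersection test, constrained to segments of the same message. Reduced to plain offsets, a self-contained sketch mirroring the doctest cases above:

```python
def rangesOverlap(aStart: int, aEnd: int, bStart: int, bEnd: int) -> bool:
    """Half-open ranges [aStart, aEnd) and [bStart, bEnd) intersect iff each starts before the other ends."""
    return aStart < bEnd and bStart < aEnd

# mirrors the doctest: segments (0, len 2) and (1, len 3) overlap, (0, len 2) and (5, len 3) do not
assert rangesOverlap(0, 2, 1, 4)
assert not rangesOverlap(0, 2, 5, 8)
```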
class MessageModifier(ABC):
@@ -497,9 +526,9 @@ def merge(self):
"""
Perform the merging.
- >>> from utils.loader import SpecimenLoader
- >>> from inference.segmentHandler import bcDeltaGaussMessageSegmentation
- >>> from inference.formatRefinement import CumulativeCharMerger
+ >>> from nemere.utils.loader import SpecimenLoader
+ >>> from nemere.inference.segmentHandler import bcDeltaGaussMessageSegmentation
+ >>> from nemere.inference.formatRefinement import CumulativeCharMerger
>>> sl = SpecimenLoader('../input/dns_ictf2010_deduped-100.pcap', layer=0, relativeToIP=True)
>>> segmentsPerMsg = bcDeltaGaussMessageSegmentation(sl)
Segmentation by inflections of sigma-0.6-gauss-filtered bit-variance.
@@ -514,7 +543,7 @@ def merge(self):
:return: a new set of segments after the input has been merged
"""
- from inference.segmentHandler import isExtendedCharSeq
+ from nemere.inference.segmentHandler import isExtendedCharSeq
minLen = 6
@@ -592,4 +621,3 @@ def split(self, segmentID: int, chunkLength: int):
-
diff --git a/src/nemere/inference/segmentHandler.py b/src/nemere/inference/segmentHandler.py
index 61d1bf87..93314bdd 100644
--- a/src/nemere/inference/segmentHandler.py
+++ b/src/nemere/inference/segmentHandler.py
@@ -1,16 +1,16 @@
"""
Batch handling of multiple segments.
"""
-
import numpy
import copy
from typing import List, Dict, Tuple, Union, Sequence, TypeVar, Iterable
from netzob.Model.Vocabulary.Symbol import Symbol, Field
+from nemere.utils.loader import BaseLoader
from nemere.inference.segments import MessageSegment, HelperSegment, TypedSegment, AbstractSegment
from nemere.inference.analyzers import MessageAnalyzer
-from nemere.inference.templates import AbstractClusterer, TypedTemplate, DistanceCalculator, DelegatingDC
+from nemere.inference.templates import TypedTemplate
def segmentMeans(segmentsPerMsg: List[List[MessageSegment]]):
@@ -49,12 +49,60 @@ def symbolsFromSegments(segmentsPerMsg: Iterable[Sequence[MessageSegment]]) -> L
"""
Generate a list of Netzob Symbols from the given lists of segments for each message.
+ >>> from nemere.inference.segmentHandler import symbolsFromSegments
+ >>> from nemere.inference.segments import MessageSegment
+ >>> from nemere.inference.analyzers import Value
+ >>> from netzob.Model.Vocabulary.Symbol import Symbol
+ >>> from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage
+ >>>
+ >>> dummymsg = RawMessage(bytes(list(range(20, 40))))
+ >>> dummyana = Value(dummymsg)
+ >>> testgapped = [[ MessageSegment(dummyana, 0, 2), MessageSegment(dummyana, 5, 2), MessageSegment(dummyana, 7, 6),
+ ... MessageSegment(dummyana, 17, 2) ]]
+ >>> symbol = symbolsFromSegments(testgapped)[0]
+ >>> print(symbol)
+ Field | Field | Field | Field | Field | Field | Field
+ ---------- | -------------- | ---------- | ----------------------- | ------ | ----- | -----
+ '\x14\x15' | '\x16\x17\x18' | '\x19\x1a' | '\x1b\x1c\x1d\x1e\x1f ' | '!"#$' | '%&' | "'"
+ ---------- | -------------- | ---------- | ----------------------- | ------ | ----- | -----
+
+ As an intermediate result, this produces:
+ ```
+ from pprint import pprint
+ pprint(filledSegments)
+ [[MessageSegment 2 bytes at (0, 2): 1415 | values: (20, 21),
+ MessageSegment 3 bytes at (2, 5): 161718 | values: (22, 23, 24),
+ MessageSegment 2 bytes at (5, 7): 191a | values: (25, 26),
+ MessageSegment 6 bytes at (7, 13): 1b1c1d1e1f20 | values: (27, 28, 29...,
+ MessageSegment 4 bytes at (13, 17): 21222324 | values: (33, 34, 35...,
+ MessageSegment 2 bytes at (17, 19): 2526 | values: (37, 38)]]
+ ```
+
:param segmentsPerMsg: List of messages, represented by lists of segments.
:return: list of Symbols, one for each entry in the given iterable of lists.
"""
- return [Symbol( [Field(segment.bytes) for segment in sorted(segSeq, key=lambda f: f.offset)],
- messages=[segSeq[0].message])
- for segSeq in segmentsPerMsg ]
+ sortedSegments = (sorted(segSeq, key=lambda f: f.offset) for segSeq in segmentsPerMsg)
+ filledSegments = list()
+ for segSeq in sortedSegments:
+ assert len(segSeq) > 0
+ filledGaps = list()
+ for segment in segSeq:
+ lastoffset = filledGaps[-1].nextOffset if len(filledGaps) > 0 else 0
+ if segment.offset > lastoffset:
+ gaplength = segment.offset - lastoffset
+ filledGaps.append(MessageSegment(segment.analyzer, lastoffset, gaplength))
+ filledGaps.append(segment)
+ # check for required trailing segment
+ lastoffset = filledGaps[-1].nextOffset
+ msglen = len(filledGaps[-1].message.data)
+ if lastoffset < msglen:
+ gaplength = msglen - lastoffset
+ filledGaps.append(MessageSegment(filledGaps[-1].analyzer, lastoffset, gaplength))
+ filledSegments.append(filledGaps)
+
+ return [ Symbol( [Field(segment.bytes) for segment in segSeq],
+ messages=[segSeq[0].message], name=f"nemesys Symbol {i}" )
+ for i,segSeq in enumerate(filledSegments) ]
def segmentsFromLabels(analyzer, labels) -> Tuple[TypedSegment]:
@@ -74,9 +122,8 @@ def segmentsFromLabels(analyzer, labels) -> Tuple[TypedSegment]:
return tuple(segments)
-# TODO replace parameter comparator by specimens
-def segmentsFixed(length: int, comparator,
- analyzerType: type, analysisArgs: Union[Tuple, None], unit=MessageAnalyzer.U_BYTE, padded=False) \
+def fixedlengthSegmenter(length: int, specimens: BaseLoader,
+ analyzerType: type, analysisArgs: Union[Tuple, None], unit=MessageAnalyzer.U_BYTE, padded=False) \
-> List[Tuple[MessageSegment]]:
"""
Segment messages into fixed size chunks.
@@ -84,10 +131,10 @@ def segmentsFixed(length: int, comparator,
>>> from nemere.utils.loader import SpecimenLoader
>>> from nemere.validation.dissectorMatcher import MessageComparator
>>> from nemere.inference.analyzers import Value
- >>> from nemere.inference.segmentHandler import segmentsFixed
+ >>> from nemere.inference.segmentHandler import fixedlengthSegmenter
>>> specimens = SpecimenLoader("../input/ntp_SMIA-20111010_deduped-100.pcap", 2, True)
>>> comparator = MessageComparator(specimens, 2, True, debug=False)
- >>> segmentedMessages = segmentsFixed(4, comparator, Value, None)
+ >>> segmentedMessages = fixedlengthSegmenter(4, specimens, Value, None)
>>> areIdentical = True
>>> for msgsegs in segmentedMessages:
... msg = msgsegs[0].message
@@ -98,7 +145,7 @@ def segmentsFixed(length: int, comparator,
:param length: Fixed length for all segments. Overhanging segments at the end that are shorter than length
will be padded with NANs.
- :param comparator: Comparator that contains the payload messages.
+ :param specimens: Loader utility class that contains the payload messages.
:param analyzerType: Type of the analysis. Subclass of inference.analyzers.MessageAnalyzer.
:param analysisArgs: Arguments for the analysis method.
:param unit: Base unit for the analysis. Either MessageAnalyzer.U_BYTE or MessageAnalyzer.U_NIBBLE.
@@ -107,7 +154,7 @@ def segmentsFixed(length: int, comparator,
:return: Segments of the analyzer's message according to the true format.
"""
segments = list()
- for l4msg, rmsg in comparator.messages.items():
+ for l4msg, rmsg in specimens.messagePool.items():
if len(l4msg.data) % length == 0: # exclude the overlap
lastOffset = len(l4msg.data)
else:
@@ -207,7 +254,7 @@ def bcDeltaGaussMessageSegmentation(specimens, sigma=0.6) -> List[List[MessageSe
-def refinements(segmentsPerMsg: List[List[MessageSegment]], dc: DistanceCalculator) -> List[List[MessageSegment]]:
+def refinements(segmentsPerMsg: List[List[MessageSegment]], **kwargs) -> List[List[MessageSegment]]:
"""
Refine the segmentation using specific improvements for the feature:
Inflections of gauss-filtered bit-congruence deltas.
@@ -217,27 +264,9 @@ def refinements(segmentsPerMsg: List[List[MessageSegment]], dc: DistanceCalculat
:param segmentsPerMsg: a list of one list of segments per message.
:return: refined segments in list per message
"""
- import inference.formatRefinement as refine
-
- print("Refine segmentation...")
-
- refinedPerMsg = list()
- for msg in segmentsPerMsg:
- # merge consecutive segments of printable-char values (\t, \n, \r, >= 0x20 and <= 0x7e) into one text field.
- charsMerged = refine.MergeConsecutiveChars(msg).merge()
- charSplited = refine.ResplitConsecutiveChars(charsMerged).split()
- refinedPerMsg.append(charSplited)
+ return nemetylRefinements(segmentsPerMsg)
- # for tests use test_segment-refinements.py
- moco = refine.CropDistinct.countCommonValues(refinedPerMsg)
- newstuff = list()
- for msg in refinedPerMsg:
- croppedMsg = refine.CropDistinct(msg, moco).split()
- charmerged = refine.CumulativeCharMerger(croppedMsg).merge()
- splitfixed = refine.SplitFixed(charmerged).split(0, 1)
- newstuff.append(splitfixed)
- return newstuff
def baseRefinements(segmentsPerMsg: Sequence[Sequence[MessageSegment]]) -> List[List[MessageSegment]]:
@@ -278,7 +307,7 @@ def nemetylRefinements(segmentsPerMsg: Sequence[Sequence[MessageSegment]]) -> Li
:param segmentsPerMsg: a list of one list of segments per message.
:return: refined segments in list per message
"""
- import inference.formatRefinement as refine
+ import nemere.inference.formatRefinement as refine
print("Refine segmentation (nemetyl refinements)...")
@@ -313,7 +342,7 @@ def charRefinements(segmentsPerMsg: Sequence[Sequence[MessageSegment]]) -> List[
:param segmentsPerMsg: a list of one list of segments per message.
:return: refined segments in list per message
"""
- import inference.formatRefinement as refine
+ import nemere.inference.formatRefinement as refine
print("Refine segmentation (char refinements)...")
@@ -341,9 +370,9 @@ def originalRefinements(segmentsPerMsg: Sequence[Sequence[MessageSegment]]) -> L
:param segmentsPerMsg: a list of one list of segments per message.
:return: refined segments in list per message
"""
- import inference.formatRefinement as refine
+ import nemere.inference.formatRefinement as refine
- print("Refine segmentation (WOOT18 refinements)...")
+ print("Refine segmentation (original WOOT18 refinements)...")
refinedPerMsg = list()
for msg in segmentsPerMsg:
@@ -441,8 +470,6 @@ def filterSegments(segments: Iterable[MessageSegment]) -> List[MessageSegment]:
return filteredSegments
def isExtendedCharSeq(values: bytes, meanCorridor=(50, 115), minLen=6):
- from nemere.inference.formatRefinement import locateNonPrintable
-
vallen = len(values)
nonzeros = [v for v in values if v > 0x00]
return (vallen >= minLen
@@ -454,7 +481,7 @@ def isExtendedCharSeq(values: bytes, meanCorridor=(50, 115), minLen=6):
# and 0.66 > len(locateNonPrintable(values)) / vallen # from smb one-char-many-zeros segments
)
-def filterChars(segments: Iterable[MessageSegment], meanCorridor=(50, 115), minLen=6):
+def filterChars(segments: Iterable[AbstractSegment], meanCorridor=(50, 115), minLen=6):
"""
Filter segments by some hypotheses about what might be a char sequence:
1. Segment is larger than minLen
@@ -475,3 +502,40 @@ def filterChars(segments: Iterable[MessageSegment], meanCorridor=(50, 115), minL
if isExtendedCharSeq(seg.bytes, meanCorridor, minLen)
]
return filtered
+
+
+def wobbleSegmentInMessage(segment: MessageSegment):
+ """
+ Generate variants of the given segment with a slightly dithered ("wobbled") boundary; only the start, for now.
+
+ For the end it would be, e. g.: if segment.nextOffset < len(segment.message.data): segment.nextOffset + 1
+
+ :param segment: The segment to wobble.
+ :return: A list containing the segment itself and its wobbled variants.
+ """
+ wobbles = [segment]
+
+ if segment.offset > 0:
+ wobbles.append(MessageSegment(segment.analyzer, segment.offset - 1, segment.length + 1))
+ if segment.length > 1:
+ wobbles.append(MessageSegment(segment.analyzer, segment.offset + 1, segment.length - 1))
+
+ return wobbles
+
+
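The variants wobbleSegmentInMessage generates can be illustrated on plain (offset, length) tuples; this stand-in is a sketch that mirrors the bounds logic of the function above without requiring the segment classes:

```python
def wobbleBounds(offset: int, length: int):
    """Mirror of wobbleSegmentInMessage on plain (offset, length) tuples."""
    variants = [(offset, length)]
    if offset > 0:
        variants.append((offset - 1, length + 1))  # extend the segment start to the left
    if length > 1:
        variants.append((offset + 1, length - 1))  # move the segment start to the right
    return variants

print(wobbleBounds(5, 3))  # [(5, 3), (4, 4), (6, 2)]
```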
+def locateNonPrintable(bstring: bytes) -> List[int]:
+ """
+ Uses a slightly broader definition of printable than Python's str.isprintable().
+
+ :param bstring: a string of bytes
+ :return: positions of bytes that are neither \t, \n, \r nor in the range 0x20 to 0x7e
+ """
+ from nemere.inference.formatRefinement import isPrintableChar
+
+ npr = list()
+ for idx, bchar in enumerate(bstring):
+ if isPrintableChar(bchar):
+ continue
+ else:
+ npr.append(idx)
+ return npr
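A quick usage sketch of locateNonPrintable, with a simplified stand-in for isPrintableChar (the real helper lives in nemere.inference.formatRefinement):

```python
def isPrintableChar(char: int) -> bool:
    # simplified stand-in: \t, \n, \r and the ASCII range 0x20..0x7e count as printable
    return char in (0x09, 0x0a, 0x0d) or 0x20 <= char <= 0x7e

bstring = b"ab\x00c\x01"
nonprintable = [idx for idx, bchar in enumerate(bstring) if not isPrintableChar(bchar)]
print(nonprintable)  # [2, 4]
```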
diff --git a/src/nemere/inference/segments.py b/src/nemere/inference/segments.py
index 037f048e..f76a524a 100644
--- a/src/nemere/inference/segments.py
+++ b/src/nemere/inference/segments.py
@@ -129,6 +129,53 @@ def findExistingAnalysis(analyzerclass: type, unit: int,
ac.analyze()
return ac
+ @staticmethod
+ def _convertSegmentSequenceAnalyzers(segments: Sequence['MessageSegment'], targetAnalyzer: Type['MessageAnalyzer'],
+ targetArguments=None):
+ return [MessageSegment(MessageAnalyzer.findExistingAnalysis(
+ targetAnalyzer, MessageAnalyzer.U_BYTE, seg.message, targetArguments), seg.offset, seg.length)
+ for seg in segments]
+
+ @staticmethod
+ def convertAnalyzers(segmentsPerMsg: Sequence[Union[Sequence['MessageSegment'],'MessageSegment']],
+ targetAnalyzer: Type['MessageAnalyzer'], targetArguments=()):
+ """
+ Converts a list of MessageSegments, or a list of lists of MessageSegments, from whatever MessageAnalyzer
+ each segment currently uses to the desired targetAnalyzer.
+ It automatically avoids generating new MessageSegment instances if the input already uses
+ the target analyzer with the given targetArguments.
+
+ >>> from nemere import SpecimenLoader
+ >>> from nemere.inference.segmentHandler import bcDeltaGaussMessageSegmentation
+ >>> from nemere.inference.segments import MessageSegment, MessageAnalyzer
+ >>> from nemere.inference.analyzers import Value
+ >>> specimens = SpecimenLoader("../input/ntp_SMIA-20111010_deduped-100.pcap", 2, True)
+ >>> segmentsPerMsg = bcDeltaGaussMessageSegmentation(specimens, 1.2)
+ >>> vps = MessageAnalyzer.convertAnalyzers(segmentsPerMsg, Value)
+ >>> anothervps = MessageAnalyzer.convertAnalyzers(vps, Value)
+ >>> vps == anothervps
+ True
+
+ :param segmentsPerMsg: List or list of lists of MessageSegments for which the analyzer should be converted.
+ :param targetAnalyzer: Desired MessageAnalyzer. Must be a subclass of MessageAnalyzer.
+ :param targetArguments: The arguments for the MessageAnalyzer. See MessageAnalyzer.setAnalysisParams()
+ :return: List or list of lists, the same structure as the input segmentsPerMsg
+ """
+ if targetArguments is None:
+ targetArguments = ()
+ if all(isinstance(msg, Sequence) for msg in segmentsPerMsg):
+ # everything is already in desired target analyzer: return untouched
+ if all(isinstance(seg.analyzer, targetAnalyzer) and targetArguments == seg.analyzer.analysisParams
+ for msg in segmentsPerMsg for seg in msg):
+ return segmentsPerMsg
+ return [MessageAnalyzer._convertSegmentSequenceAnalyzers(msg, targetAnalyzer, targetArguments)
+ for msg in segmentsPerMsg]
+ else:
+ # everything is already in desired target analyzer: return untouched
+ if all(isinstance(seg.analyzer, targetAnalyzer) and targetArguments == seg.analyzer.analysisParams
+ for seg in segmentsPerMsg):
+ return segmentsPerMsg
+ return MessageAnalyzer._convertSegmentSequenceAnalyzers(segmentsPerMsg, targetAnalyzer, targetArguments)
def ngrams(self, n: int):
"""
@@ -343,18 +390,13 @@ def calcEntropy(tokens, alphabet_len = 2):
:return: entropy in token list
"""
- # unit = U_BYTE nibble => ASCII ?!
- alphabet = dict()
+ from collections import Counter
# get counts for each word of the alphabet
- for x in tokens:
- if x in alphabet:
- alphabet[x] += 1
- else:
- alphabet[x] = 1
+ alphabet = Counter(tokens)
entropy = 0
for x in alphabet:
- # probability of value in string
+ # probability of value in tokens
p_x = float(alphabet[x]) / len(tokens)
entropy += - p_x * math.log(p_x, alphabet_len)
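The Counter-based rewrite computes the standard Shannon entropy of the token list. A self-contained sketch with two worked values:

```python
import math
from collections import Counter

def calcEntropy(tokens, alphabet_len=2):
    """Shannon entropy of the token list, in units of log base alphabet_len."""
    alphabet = Counter(tokens)
    return sum(-(count / len(tokens)) * math.log(count / len(tokens), alphabet_len)
               for count in alphabet.values())

print(calcEntropy([0, 1, 0, 1]))  # 1.0 bit: two equally likely tokens
print(calcEntropy([7, 7, 7, 7]))  # -0.0: a single repeated token carries no information
```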
@@ -426,9 +468,12 @@ def __init__(self):
self.length = None
@property
- def values(self) -> Tuple:
+ def values(self) -> Union[Tuple, None]:
return self._values
+ def __len__(self):
+ return self.length
+
T = TypeVar('T')
def fillCandidate(self, candidate: T) -> T:
"""
@@ -602,6 +647,8 @@ def __init__(self, analyzer: MessageAnalyzer,
raise ValueError('Offset is not an int.')
if not isinstance(length, int):
raise ValueError('Length is not an int. Its representation is ', repr(length))
+ if not length >= 1:
+ raise ValueError('Length must be a positive number, not ', repr(length))
if offset >= len(self.message.data):
raise ValueError('Offset {} too large for message of length {}.'.format(offset, len(self.message.data)))
if offset+length-1 > len(self.message.data):
@@ -614,10 +661,6 @@ def __init__(self, analyzer: MessageAnalyzer,
"""byte count of the segment this object represents in the originating message"""
- def __len__(self):
- return self.length
-
-
@property
def analyzer(self):
return self._analyzer
diff --git a/src/nemere/inference/templates.py b/src/nemere/inference/templates.py
index 0ebc8b40..050e15e2 100644
--- a/src/nemere/inference/templates.py
+++ b/src/nemere/inference/templates.py
@@ -66,7 +66,7 @@ def __init__(self, segments: Iterable[AbstractSegment], method='canberra',
self._method = method
self.thresholdFunction = thresholdFunction if thresholdFunction else DistanceCalculator.neutralThreshold
self.thresholdArgs = thresholdArgs if thresholdArgs else {}
- self._segments = list() # type: List[MessageSegment]
+ self._segments = list() # type: List[AbstractSegment]
self._quicksegments = list() # type: List[Tuple[int, int, Tuple[float]]]
"""List of Tuples: (index of segment in self._segments), (segment length), (Tuple of segment analyzer values)"""
# ensure that all segments have analysis values
@@ -80,7 +80,7 @@ def __init__(self, segments: Iterable[AbstractSegment], method='canberra',
self._distances = type(self)._getDistanceMatrix(self._embdedAndCalcDistances(), len(self._quicksegments))
# prepare lookup for matrix indices
- self._seg2idx = {seg: idx for idx, seg in enumerate(self._segments)}
+ self._seg2idx = {seg: idx for idx, seg in enumerate(self._segments)} # type: Dict[AbstractSegment, int]
if manipulateChars:
# Manipulate calculated distances for all char/char pairs.
@@ -167,7 +167,7 @@ def similarityMatrix(self) -> numpy.ndarray:
return similarityMatrix
@property
- def segments(self) -> List[MessageSegment]:
+ def segments(self) -> List[AbstractSegment]:
"""
:return: All segments in this object.
"""
@@ -661,6 +661,7 @@ def embedSegment(shortSegment: Tuple[int, int, Tuple[float]], longSegment: Tuple
subsetsSimi = scipy.spatial.distance.cdist(segmentValuesMatrix, numpy.array([shortSegment[2]]), method)
shift = subsetsSimi.argmin() # for debugging and evaluation
+ # noinspection PyArgumentList
distance = subsetsSimi.min()
return method, shift, (shortSegment[0], longSegment[0], distance)
@@ -703,7 +704,7 @@ def _embdedAndCalcDistances(self) -> \
complete distance list of all combinations of the into segment list regardless of their length.
>>> from tabulate import tabulate
- >>> from utils.baseAlgorithms import generateTestSegments
+ >>> from nemere.utils.baseAlgorithms import generateTestSegments
>>> segments = generateTestSegments()
>>> DistanceCalculator.debug = False
>>> dc = DistanceCalculator(segments)
@@ -739,10 +740,12 @@ def _embdedAndCalcDistances(self) -> \
:return: List of Tuples
(index of segment in self._segments), (segment length), (Tuple of segment analyzer values)
"""
+ from concurrent.futures.process import BrokenProcessPool
+ import time
+
dissCount = 0
lenGrps = self.groupByLength() # segment list is in format of self._quicksegments
- import time
pit_start = time.time()
rslens = list(reversed(sorted(lenGrps.keys()))) # lengths, sorted by decreasing length
@@ -761,13 +764,20 @@ def _embdedAndCalcDistances(self) -> \
# int_runtime, self.segments[0].message.data[:5].hex(), outerlen))
else:
import concurrent.futures
- with concurrent.futures.ProcessPoolExecutor(max_workers=cpu_count()) as executor: # Process # Thread
+ with concurrent.futures.ProcessPoolExecutor(max_workers=cpu_count()-1) as executor: # Process # Thread
futureDis = dict()
for outerlen in rslens:
futureDis[executor.submit(self._outerloop, lenGrps, outerlen, rslens)] = outerlen
futureRes = dict()
for future in concurrent.futures.as_completed(futureDis.keys()):
- futureRes[futureDis[future]] = future.result()
+ try:
+ futureRes[futureDis[future]] = future.result()
+ except BrokenProcessPool as e:
+ import IPython
+ # report the length group mapped to the failed future, not the loop variable
+ print("Process failed. outerlen", futureDis[future])
+ print()
+ IPython.embed() # debugging aid: inspect the failure interactively
+ raise e
for ol in rslens:
for diss in futureRes[ol]:
dissCount += 1
@@ -896,7 +906,7 @@ def normFactor(method: str, dimensions: int, analyzerDomain: Tuple[float, float]
distanceMax['sqeuclidean'] = dimensions * domainSize**2
return 1 / distanceMax[method]
- def neigbors(self, segment: AbstractSegment, subset: List[MessageSegment]=None) -> List[Tuple[int, float]]:
+ def neighbors(self, segment: AbstractSegment, subset: List[MessageSegment]=None) -> List[Tuple[int, float]]:
# noinspection PyUnresolvedReferences
"""
@@ -920,7 +930,7 @@ def neigbors(self, segment: AbstractSegment, subset: List[MessageSegment]=None)
>>> DistanceCalculator.debug = False
>>> dc = DistanceCalculator(segments)
Calculated distances for 37 segment pairs in ... seconds.
- >>> nbrs = dc.neigbors(segments[2], segments[3:7])
+ >>> nbrs = dc.neighbors(segments[2], segments[3:7])
>>> dsts = [dc.pairDistance(segments[2], segments[3]),
... dc.pairDistance(segments[2], segments[4]),
... dc.pairDistance(segments[2], segments[5]),
@@ -954,9 +964,11 @@ def neigbors(self, segment: AbstractSegment, subset: List[MessageSegment]=None)
@staticmethod
def _checkCacheFile(analysisTitle: str, tokenizer: str, pcapfilename: str):
- from os.path import splitext, basename, exists
+ from os.path import splitext, basename, exists, join
+ from nemere.utils.evaluationHelpers import cacheFolder
pcapName = splitext(basename(pcapfilename))[0]
dccachefn = 'cache-dc-{}-{}-{}.{}'.format(analysisTitle, tokenizer, pcapName, 'dc')
+ dccachefn = join(cacheFolder, dccachefn)
if not exists(dccachefn):
return False, dccachefn
else:
@@ -1074,8 +1086,14 @@ def __init__(self, values: Union[Tuple[Union[float, int]], MessageSegment],
@property
def bytes(self):
if isinstance(self._values, numpy.ndarray):
- return bytes(self._values.astype(int).tolist())
+ if any(numpy.isnan(self._values)):
+ return None # TODO relevant for #10
+ bi = self._values.astype(int).tolist()
+ # noinspection PyTypeChecker
+ return bytes(bi)
if isinstance(self._values, Iterable):
+ if any(numpy.isnan(self._values)):
+ return None # TODO relevant for #10
return bytes(self._values)
return None
@@ -1087,7 +1105,7 @@ def analyzer(self):
def checkSegmentsAnalysis(self):
"""
- Validate that all base segments of this tempalte are configured with the same type of analysis.
+ Validate that all base segments of this template are configured with the same type of analysis.
:raises: ValueError if not all analysis types and parameters of the base segments are identical.
:return: Doesn't return anything if all is well. Raises a ValueError otherwise.
@@ -1169,7 +1187,7 @@ def distancesToMixedLength(self, dc: DistanceCalculator=None):
0.246... 0
0.373... 0
0.285... 0
- 0.5... 1
+ 0.539... 1
0.497... 0
0.825... -3
0.682... 1
@@ -1233,6 +1251,7 @@ def distToNearest(self, segment: Union[MessageSegment, Sequence[MessageSegment]]
segments = segment
else:
segments = [segment]
+ # noinspection PyArgumentList
return dc.distancesSubset(segments, self.baseSegments).min()
def __hash__(self):
@@ -1319,7 +1338,7 @@ def fieldtype(self, value: str):
* returning translation from segment to representative's indices:
* segments2index()
* internally: pairDistance() / distancesSubset()
- * internally: neigbors
+ * internally: neighbors
* internally: findMedoid
"""
class DelegatingDC(DistanceCalculator):
@@ -1469,12 +1488,8 @@ def _templates4allZeros(segments: Iterable[MessageSegment]):
numpy.count_nonzero(s.values) - numpy.count_nonzero(numpy.isnan(s.values)) > 0]
raise NotImplementedError()
- @staticmethod
- def _templates4Paddings(segments: Iterable[MessageSegment]):
- raise NotImplementedError()
-
- def segments2index(self, segmentList: Iterable[MessageSegment]):
+ def segments2index(self, segmentList: Iterable[AbstractSegment]):
# noinspection PyUnresolvedReferences
"""
Look up the indices of the given segments.
@@ -1554,7 +1569,7 @@ def pairDistance(self, A: MessageSegment, B: MessageSegment) -> numpy.float64:
b = self._seg2idx[B] if B in self._seg2idx else self.reprMap[B]
return self._distances[a, b]
- def distancesSubset(self, As: Sequence[MessageSegment], Bs: Sequence[MessageSegment] = None) \
+ def distancesSubset(self, As: Sequence[AbstractSegment], Bs: Sequence[AbstractSegment] = None) \
-> numpy.ndarray:
"""
Retrieve a matrix of pairwise distances for two lists of segments, resolving representatives internally if necessary.
diff --git a/src/nemere/utils/baseAlgorithms.py b/src/nemere/utils/baseAlgorithms.py
index 69a65887..e551f000 100644
--- a/src/nemere/utils/baseAlgorithms.py
+++ b/src/nemere/utils/baseAlgorithms.py
@@ -27,7 +27,6 @@ def sad(v, u):
raise ValueError("Vectors need to be of equal length.")
return numpy.sum(numpy.abs(numpy.subtract(v, u)))
-
def tril(arrayIn: numpy.ndarray) -> numpy.ndarray:
"""
>>> a = numpy.array([[1,2,3,4],[2,3,4,5],[3,4,5,6],[4,5,6,7]])
@@ -35,12 +34,21 @@ def tril(arrayIn: numpy.ndarray) -> numpy.ndarray:
array([2, 3, 4, 4, 5, 6])
:param arrayIn: a symmetrical matrix
- :return: lower triangle values of arrayIn removing the identity (diagonal).
+ :return: Value list of the lower triangle of arrayIn without the identity (diagonal).
"""
premask = numpy.full_like(arrayIn, True, bool)
- mask = numpy.tril(premask, k=-1) # mask including the first diagonal
+ mask = numpy.tril(premask, k=-1) # mask of the strict lower triangle, excluding the main diagonal (k=-1)
return arrayIn[mask]
+def trilNaN(distances: numpy.ndarray):
+ """
+ :param distances: a symmetrical matrix
+ :return: lower triangle in original shape. Upper triangle and identity (diagonal) set to nan.
+ """
+ mask = numpy.tril(numpy.full_like(distances, True, bool), k=-1)
+ dist = distances.copy()
+ dist[~mask] = numpy.nan
+ return dist
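A usage sketch of trilNaN (the function as added above, repeated here so the snippet runs on its own):

```python
import numpy

def trilNaN(distances: numpy.ndarray) -> numpy.ndarray:
    # keep the strict lower triangle, set everything else to NaN
    mask = numpy.tril(numpy.full_like(distances, True, bool), k=-1)
    dist = distances.copy()
    dist[~mask] = numpy.nan
    return dist

a = numpy.array([[1., 2., 3.],
                 [2., 3., 4.],
                 [3., 4., 5.]])
print(trilNaN(a))
# [[nan nan nan]
#  [ 2. nan nan]
#  [ 3.  4. nan]]
```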
def generateTestSegments():
from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage
diff --git a/src/nemere/utils/evaluationHelpers.py b/src/nemere/utils/evaluationHelpers.py
index 15045eaa..7d1ed2f1 100644
--- a/src/nemere/utils/evaluationHelpers.py
+++ b/src/nemere/utils/evaluationHelpers.py
@@ -2,16 +2,20 @@
Module encapsulating evaluation parameters and helper functions to validate aspects of the
NEMESYS and NEMETYL approaches.
"""
-from typing import TypeVar, Hashable, Sequence, Callable, Iterable
-from netzob.all import RawMessage
+from collections import defaultdict, Counter
+from typing import TypeVar, Sequence, Callable, Iterable, Optional
from itertools import chain
import os, csv, pickle, time
+from os.path import join, splitext, isfile, isdir, basename, exists, abspath
+from tabulate import tabulate
+from nemere import inferred4segment, markSegNearMatch
from nemere.utils.loader import SpecimenLoader
-from nemere.validation.dissectorMatcher import MessageComparator
+from nemere.validation.dissectorMatcher import MessageComparator, BaseComparator
from nemere.inference.analyzers import *
+from nemere.inference.formatRefinement import isOverlapping
from nemere.inference.segmentHandler import segmentsFromLabels, bcDeltaGaussMessageSegmentation, refinements, \
- segmentsFixed
+ fixedlengthSegmenter
from nemere.inference.segments import MessageAnalyzer, TypedSegment, MessageSegment, AbstractSegment
from nemere.inference.templates import DistanceCalculator, DelegatingDC, Template, MemmapDC
@@ -102,8 +106,6 @@
cacheFolder = "cache"
clStatsFile = os.path.join(reportFolder, 'messagetype-cluster-statistics.csv')
ccStatsFile = os.path.join(reportFolder, 'messagetype-combined-cluster-statistics.csv')
-scStatsFile = os.path.join(reportFolder, 'segment-cluster-statistics.csv')
-coStatsFile = os.path.join(reportFolder, 'segment-collective-cluster-statistics.csv')
unknown = "[unknown]"
@@ -120,298 +122,8 @@ def annotateFieldTypes(analyzerType: type, analysisArgs: Union[Tuple, None], com
for l4msg, rmsg in comparator.messages.items()]
return segmentedMessages
-
-def writeIndividualMessageClusteringStaticstics(
- clusters: Dict[Hashable, List[Tuple[MessageSegment]]], groundtruth: Dict[RawMessage, str],
- runtitle: str, comparator: MessageComparator):
- """
- calculate conciseness, correctness = precision, and recall
-
- """
- abstrMsgClusters = {lab : [comparator.messages[element[0].message] for element in segseq]
- for lab, segseq in clusters.items()}
- return writeIndividualClusterStatistics(abstrMsgClusters, groundtruth, runtitle, comparator)
-
-
-def writeIndividualClusterStatistics(
- clusters: Dict[Hashable, List[Element]], groundtruth: Dict[Element, str],
- runtitle: str, comparator: MessageComparator):
- # clusters: clusterlabel : List of Segments (not Templates!)
- # groundtruth: Lookup for Segment : true type string
- from collections import Counter
-
- outfile = clStatsFile if isinstance(next(iter(groundtruth.keys())), AbstractMessage) else scStatsFile
- print('Write {} cluster statistics to {}...'.format(
- "message" if isinstance(next(iter(groundtruth.keys())), AbstractMessage) else "segment",
- outfile))
-
- numSegs = 0
- prList = []
- noise = None
- if 'Noise' in clusters:
- noisekey = 'Noise'
- elif -1 in clusters:
- noisekey = -1
- else:
- noisekey = None
-
- if noisekey:
- prList.append(None)
- noise = clusters[noisekey]
- clusters = {k: v for k, v in clusters.items() if k != noisekey} # remove the noise
-
- numClusters = len(clusters)
- numTypesOverall = Counter(groundtruth.values())
- numTypes = len(numTypesOverall)
- conciseness = numClusters / numTypes
-
- for label, cluster in clusters.items():
- # we assume correct Tuples of MessageSegments with all objects in one Tuple originating from the same message
- typeFrequency = Counter([groundtruth[element] for element in cluster])
- mostFreqentType, numMFTinCluster = typeFrequency.most_common(1)[0]
- numSegsinCuster = len(cluster)
- numSegs += numSegsinCuster
-
- precision = numMFTinCluster / numSegsinCuster
- recall = numMFTinCluster / numTypesOverall[mostFreqentType]
-
- prList.append((label, mostFreqentType, precision, recall, numSegsinCuster))
-
- # noise statistics
- if noise:
- numNoise = len(noise)
- numSegs += numNoise
- ratioNoise = numNoise / numSegs
- noiseTypes = {groundtruth[element] for element in noise}
-
- csvWriteHead = False if os.path.exists(outfile) else True
- with open(outfile, 'a') as csvfile:
- clStatscsv = csv.writer(csvfile) # type: csv.writer
- if csvWriteHead:
- # in "pagetitle": "seg_length", "analysis", "dist_measure", 'min_cluster_size'
- clStatscsv.writerow([
- 'run_title', 'trace', 'conciseness', 'cluster_label', 'most_freq_type', 'precision', 'recall', 'cluster_size'])
- if noise:
- # noinspection PyUnboundLocalVariable
- clStatscsv.writerow([
- runtitle, comparator.specimens.pcapFileName, conciseness, 'NOISE', str(noiseTypes), 'ratio:', ratioNoise, numNoise])
- clStatscsv.writerows([
- (runtitle, comparator.specimens.pcapFileName, conciseness, *pr) for pr in prList if pr is not None
- ])
-
- return prList, conciseness
-
-
-
-def writeCollectiveClusteringStaticstics(
- clusters: Dict[Hashable, List[Element]], groundtruth: Dict[Element, str],
- runtitle: str, comparator: MessageComparator, ignoreUnknown=True):
- """
- Precision and recall for the whole clustering interpreted as number of draws from pairs of messages.
-
- For details see: https://nlp.stanford.edu/IR-book/html/htmledition/evaluation-of-clustering-1.html
- How to calculate the draws is calculated for the Rand index in the document.
-
- Writes a CSV with tp, fp, fn, tn, pr, rc
- for (a) all clusters and for (b) clusters that have a size of at least 1/40 of the number of samples/messages.
-
- 'total segs' and 'unique segs' are including 'unknown' and 'noise'
- """
- from collections import Counter
- from itertools import combinations, chain
- from scipy.special import binom
-
- outfile = ccStatsFile if isinstance(next(iter(groundtruth.keys())), AbstractMessage) else coStatsFile
- print('Write message cluster statistics to {}...'.format(outfile))
-
- segTotal = sum(
- sum(len(el.baseSegments) if isinstance(el, Template) else 1 for el in cl)
- for cl in clusters.values() )
- segUniqu = sum(len(cl) for cl in clusters.values())
-
- if ignoreUnknown:
- unknownKeys = ["[unknown]", "[mixed]"]
- numUnknown = len([gt for gt in groundtruth.values() if gt in unknownKeys])
- clustersTemp = {lab: [el for el in clu if groundtruth[el] not in unknownKeys] for lab, clu in clusters.items()}
- clusters = {lab: elist for lab, elist in clustersTemp.items() if len(elist) > 0}
- groundtruth = {sg: gt for sg,gt in groundtruth.items() if gt not in unknownKeys}
- else:
- numUnknown = "n/a"
-
- noise = []
- noisekey = 'Noise' if 'Noise' in clusters else -1 if -1 in clusters else None
- # print("noisekey", noisekey)
- if noisekey is not None:
- noise = clusters[noisekey]
- clusters = {k: v for k, v in clusters.items() if k != noisekey} # remove the noise
-
- """
- # # # # # # # #
- # test case
- >>> groundtruth = {
- >>> "x0": "x", "x1": "x", "x2": "x", "x3": "x", "x4": "x", "x5": "x", "x6": "x", "x7": "x",
- >>> "o0": "o", "o1": "o", "o2": "o", "o3": "o", "o4": "o",
- >>> "#0": "#", "#1": "#", "#2": "#", "#3": "#"
- >>> }
- >>> clusters = { "A": ["x1", "x2", "x3", "x4", "x5", "o1"],
- >>> "B": ["x6", "o2", "o3", "o4", "o0", "#1"],
- >>> "C": ["x7", "x0", "#2", "#3", "#0"],
- >>> }
- >>> typeFrequencies = [Counter([groundtruth[element] for element in c])
- for c in clusters.values()]
- # # # # # # # #
- """
-
- # numTypesOverall = Counter(groundtruth[comparator.messages[element[0].message]]
- # for c in clusters.values() for element in c)
- numTypesOverall = Counter(groundtruth.values())
- # number of types per cluster
- typeFrequencies = [Counter([groundtruth[element] for element in c])
- for c in clusters.values()]
- noiseTypes = Counter([groundtruth[element] for element in noise])
-
- tpfp = sum(binom(len(c), 2) for c in clusters.values())
- tp = sum(binom(t,2) for c in typeFrequencies for t in c.values())
- tnfn = sum(map(lambda n: n[0] * n[1], combinations(
- (len(c) for c in chain.from_iterable([clusters.values(), [noise]])), 2))) + \
- sum(binom(noiseTypes[typeName],2) for typeName, typeTotal in noiseTypes.items())
- # import IPython; IPython.embed()
- # fn = sum(((typeTotal - typeCluster[typeName]) * typeCluster[typeName]
- # for typeCluster in typeFrequencies + [noiseTypes]
- # for typeName, typeTotal in numTypesOverall.items() if typeName in typeCluster))//2
- #
- # # noise handling: consider all elements in noise as false negatives
- fn = sum(((typeTotal - typeCluster[typeName]) * typeCluster[typeName]
- for typeCluster in typeFrequencies
- for typeName, typeTotal in numTypesOverall.items() if typeName in typeCluster))//2 + \
- sum((binom(noiseTypes[typeName],2) +
- (
- (typeTotal - noiseTypes[typeName]) * noiseTypes[typeName]
- )//2
- for typeName, typeTotal in numTypesOverall.items() if typeName in noiseTypes))
-
-
- # precision = tp / (tp + fp)
- precision = tp / tpfp
- recall = tp / (tp + fn)
-
- head = [ 'run_title', 'trace', 'true positives', 'false positives', 'false negatives', 'true negatives',
- 'precision', 'recall', 'noise', 'unknown', 'total segs', 'unique segs']
- row = [ runtitle, comparator.specimens.pcapFileName, tp, tpfp-tp, fn, tnfn-fn,
- precision, recall, len(noise), numUnknown, segTotal, segUniqu ]
-
- csvWriteHead = False if os.path.exists(outfile) else True
- with open(outfile, 'a') as csvfile:
- clStatscsv = csv.writer(csvfile) # type: csv.writer
- if csvWriteHead:
- clStatscsv.writerow(head)
- clStatscsv.writerow(row)
-
-
-def writeCollectiveMessageClusteringStaticstics(
- messageClusters: Dict[Hashable, List[Tuple[MessageSegment]]], groundtruth: Dict[RawMessage, str],
- runtitle: str, comparator: MessageComparator):
- abstrMsgClusters = {lab : [comparator.messages[element[0].message] for element in segseq]
- for lab, segseq in messageClusters.items()}
- return writeCollectiveClusteringStaticstics(abstrMsgClusters, groundtruth, runtitle, comparator)
-
-
-def plotMultiSegmentLines(segmentGroups: List[Tuple[str, List[Tuple[str, TypedSegment]]]],
- specimens: SpecimenLoader, pagetitle=None, colorPerLabel=False,
- typeDict: Dict[str, List[MessageSegment]] = None,
- isInteractive=False):
- """
- This is a not awfully important helper function saving the writing of a few lines code.
-
- :param segmentGroups:
- :param specimens:
- :param pagetitle:
- :param colorPerLabel:
- :param typeDict: dict of types (str-keys: list of segments) present in the segmentGroups
- :param isInteractive:
- :return:
- """
- from nemere.visualization.multiPlotter import MultiMessagePlotter
-
- mmp = MultiMessagePlotter(specimens, pagetitle, len(segmentGroups), isInteractive=isInteractive)
- mmp.plotMultiSegmentLines(segmentGroups, colorPerLabel)
-
- # TODO redundant code of writeIndividualMessageClusteringStaticstics
- if typeDict: # calculate conciseness, correctness = precision, and recall
- import os, csv
- from collections import Counter
- from nemere.inference.templates import Template
-
- # mapping from each segment in typeDict to the corresponding cluster and true type,
- # considering representative templates
- segment2type = {seg: ft for ft, segs in typeDict.items() for seg in segs}
- clusters = list()
- for label, segList in segmentGroups:
- cluster = list()
- for tl, seg in segList:
- if isinstance(seg, Template):
- cluster.extend((tl, bs) for bs in seg.baseSegments)
- else:
- cluster.append((tl, seg))
- clusters.append(cluster)
-
- numSegs = len(segment2type)
- prList = []
- noise = None
- if 'Noise' in segmentGroups[0][0]:
- noise, *clusters = clusters # remove the noise
- prList.append(None)
-
- numClusters = len(clusters)
- numFtypes = len(typeDict)
- conciseness = numClusters / numFtypes
-
- for clusterSegs in clusters:
- # type from typeDict
- typeKey, numMFTinCluster = Counter(segment2type[seg] for tl, seg in clusterSegs).most_common(1)[0]
- # number of segments for the prevalent type in the trace
- numMFToverall = len(typeDict[typeKey])
- numSegsinCuster = len(clusterSegs)
-
- precision = numMFTinCluster / numSegsinCuster
- recall = numMFTinCluster / numMFToverall
-
- # # rather do not repeat the amount in the label
- # mostFrequentType = "{}: {} Seg.s".format(typeKey, numMFTinCluster)
- mostFrequentType = typeKey
- prList.append((mostFrequentType, precision, recall, numSegsinCuster))
-
- mmp.textInEachAx(["precision = {:.2f}\n" # correctness
- "recall = {:.2f}".format(pr[1], pr[2]) if pr else None for pr in prList])
-
- # noise statistics
- if noise:
- numNoise = len(noise)
- ratioNoise = numNoise / numSegs
- noiseTypes = {ft for ft, seg in noise}
-
-
- csvWriteHead = False if os.path.exists(scStatsFile) else True
- with open(scStatsFile, 'a') as csvfile:
- clStatscsv = csv.writer(csvfile) # type: csv.writer
- if csvWriteHead:
- # in "pagetitle": "seg_length", "analysis", "dist_measure", 'min_cluster_size'
- clStatscsv.writerow(['run_title', 'trace', 'conciseness', 'most_freq_type',
- 'precision', 'recall', 'cluster_size'])
- if noise:
- # noinspection PyUnboundLocalVariable
- clStatscsv.writerow([pagetitle, specimens.pcapFileName, conciseness, 'NOISE',
- str(noiseTypes), ratioNoise, numNoise])
- clStatscsv.writerows([
- (pagetitle, specimens.pcapFileName, conciseness, *pr) for pr in prList if pr is not None
- ])
-
- mmp.writeOrShowFigure()
- del mmp
-
-
def labelForSegment(segGrpHier: List[Tuple[str, List[Tuple[str, List[Tuple[str, TypedSegment]]]]]],
- seg: AbstractSegment) -> Union[str, bool]:
+ seg: AbstractSegment) -> Union[str, None]:
"""
Determine the group label of a segment from a deep hierarchy of segment clusters/groups.
@@ -441,7 +153,31 @@ def labelForSegment(segGrpHier: List[Tuple[str, List[Tuple[str, List[Tuple[str,
else:
return "[unknown]"
- return False
+ return None
+
+def consolidateLabels(labels: numpy.ndarray, trigger = "$d_{max}$=0.000", maxLabels=20):
+ """
+ Replace each label that contains the trigger, in-place, by one common label.
+ If after this procedure more than maxLabels distinct labels remain, only the maxLabels largest are retained.
+ """
+ zeroDmaxUnique = {c for c in labels if isinstance(c, str) and trigger in c}
+ zeroDcount = len(zeroDmaxUnique)
+ commonLabel = f"one of {zeroDcount} clusters with $d_{{max}}$=0.000"
+ for ci in range(labels.size):
+ if isinstance(labels[ci], str) and trigger in labels[ci]:
+ labels[ci] = commonLabel
+ lCounter = Counter(labels)
+ if len(lCounter) > maxLabels:
+ print("Still too many cluster labels! Merging the smallest ones, retaining 20 clusters.")
+ leastCommon = list(zip(*list(lCounter.most_common())[20:]))[0]
+ lcAmount = len(leastCommon)
+ if commonLabel in leastCommon:
+ lcAmount += zeroDcount
+ lcLabel = f"one of the {lcAmount} smallest clusters"
+ for ci in range(labels.size):
+ if labels[ci] in leastCommon:
+ labels[ci] = lcLabel
+ return labels
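The first consolidation step of consolidateLabels, mirrored as a self-contained sketch (the label contents are made up for illustration):

```python
import numpy

trigger = "$d_{max}$=0.000"
labels = numpy.array(["A " + trigger, "B " + trigger, "C"], dtype=object)

# collapse all labels containing the trigger into one common label
zeroDcount = len({c for c in labels if isinstance(c, str) and trigger in c})
commonLabel = f"one of {zeroDcount} clusters with $d_{{max}}$=0.000"
for ci in range(labels.size):
    if isinstance(labels[ci], str) and trigger in labels[ci]:
        labels[ci] = commonLabel
print(labels)  # ['one of 2 clusters with $d_{max}$=0.000', 'one of 2 clusters with $d_{max}$=0.000', 'C']
```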
def writePerformanceStatistics(specimens, clusterer, algos,
@@ -584,115 +320,244 @@ def calcHexDist(hexA, hexB):
return dc.pairDistance(*segments)
-def cacheAndLoadDC(pcapfilename: str, analysisTitle: str, tokenizer: str, debug: bool,
- analyzerType: type, analysisArgs: Tuple=None, sigma: float=None, filterTrivial=False,
- refinementCallback:Union[Callable, None] = refinements,
- disableCache=False) \
- -> Tuple[SpecimenLoader, MessageComparator, List[Tuple[MessageSegment]], DistanceCalculator,
- float, float]:
- """
- cache or load the DistanceCalculator to or from the filesystem
-
-
- :param filterTrivial: Filter out **one-byte** segments and such, just consisting of **zeros**.
- :param disableCache: When experimenting with distances manipulation, deactivate caching!
- :return:
- """
- pcapbasename = os.path.basename(pcapfilename)
- # if refinementCallback == pcaMocoRefinements:
- # sigma = pcamocoSigmapertrace[pcapbasename] if not sigma and pcapbasename in pcamocoSigmapertrace else \
- # 0.9 if not sigma else sigma
- # else:
- sigma = sigmapertrace[pcapbasename] if not sigma and pcapbasename in sigmapertrace else \
- 0.9 if not sigma else sigma
- pcapName = os.path.splitext(pcapbasename)[0]
- # noinspection PyUnboundLocalVariable
- tokenparm = tokenizer if tokenizer != "nemesys" else \
- "{}{:.0f}".format(tokenizer, sigma * 10)
- dccachefn = os.path.join(cacheFolder, 'cache-dc-{}-{}-{}-{}-{}.{}'.format(
- analysisTitle, tokenparm, "filtered" if filterTrivial else "all",
- refinementCallback.__name__ if refinementCallback is not None else "raw",
- pcapName, 'ddc'))
- # dccachefn = 'cache-dc-{}-{}-{}.{}'.format(analysisTitle, tokenizer, pcapName, 'dc')
- if disableCache or not os.path.exists(dccachefn):
- # dissect and label messages
- print("Load messages from {}...".format(pcapName))
- specimens = SpecimenLoader(pcapfilename, 2, True)
- comparator = MessageComparator(specimens, 2, True, debug=debug)
+class CachedDistances(object):
+ def __init__(self, pcapfilename: str, analysisTitle: str, layer=2, relativeToIP=True):
+ """
+ Cache or load the DistanceCalculator to or from the filesystem
+ """
+ self.pcapfilename = pcapfilename # type: str
+ self.pcapbasename = os.path.basename(pcapfilename)
+ self.pcapName = os.path.splitext(self.pcapbasename)[0]
+
+ self.layer = layer # type: int
+ self.relativeToIP = relativeToIP # type: bool
+ self.analysisTitle = analysisTitle # type: str
+ self.analyzerType = analyses[analysisTitle] # type: Type[MessageAnalyzer]
+ self.analysisArgs = None # type: Union[None, Tuple]
+
+ self.tokenizer = None # type: Union[None, str]
+ self.sigma = None # type: Union[None, float]
+ self.filter = False # type: bool
+ self.refinementCallback = None # type: Union[Callable, None]
+ self.refinementArgs = None # type: Union[None, Dict]
+ """kwargs for the refinement function, e. g., reportFolder or collectedSubclusters"""
+ self.forwardComparator = False # type: bool
+
+ self.dissectGroundtruth = True
+ """Expect tshark to know a dissector for the protocol and initialize a MessageComparator for it.
+ Set to False for an unknown protocol!"""
+ self.disableCache = False # type: bool
+ """When experimenting with distances manipulation, deactivate caching by setting disableCache to True!"""
+ self.debug = False # type: bool
+
+ self.dccachefn = None # type: Union[None, str]
+ self.isLoaded = False
+
+ self.specimens = None # type: Union[None, SpecimenLoader]
+ self.comparator = None # type: Union[None, BaseComparator]
+ self.segmentedMessages = None # type: Union[None, List[Tuple[MessageSegment]]]
+ self.rawSegmentedMessages = None # type: Union[None, List[Tuple[MessageSegment]]]
+ self.dc = None # type: Union[None, DistanceCalculator]
+ self.segmentationTime = None # type: Union[None, float]
+ self.dist_calc_segmentsTime = None # type: Union[None, float]
+
+ def configureAnalysis(self, *analysisArgs):
+ """optional"""
+ self.analysisArgs = analysisArgs
+
+ def configureTokenizer(self, tokenizer: str, sigma: float=None, filtering=False):
+ """
+ mandatory (but may be called without parameters)
+
+ :param tokenizer: The tokenizer to use. One of: "tshark", "4bytesfixed", "nemesys"
+ :param sigma: Required only for nemesys: The sigma value to use. If not set,
+ the value in sigmapertrace in this module is looked up and if the trace is not known there, use 0.9.
+ :param filtering: Filter out **one-byte** segments and those consisting only of **zeros**.
+ """
+ # if refinementCallback == pcaMocoRefinements:
+ # sigma = pcamocoSigmapertrace[pcapbasename] if not sigma and pcapbasename in pcamocoSigmapertrace else \
+ # 0.9 if not sigma else sigma
+ # else:
+ self.sigma = sigmapertrace[self.pcapbasename] if not sigma and self.pcapbasename in sigmapertrace else \
+ 0.9 if not sigma else sigma
+ self.tokenizer = tokenizer
+ self.filter = filtering
+
+ def configureRefinement(self, refinementCallback:Union[Callable, None] = refinements, forwardComparator=False,
+ **refinementArgs):
+ """
+ optional
+
+ :param refinementCallback: The function to use for refinement.
+ Existing refinements can be found in segmentHandler.
+ :param forwardComparator: If True, the comparator instance is passed to the refinementCallback
+ as additional keyword argument with the key "comparator".
+ :param refinementArgs: kwargs for the refinement function, e. g., reportFolder or collectedSubclusters
+ """
+ self.refinementCallback = refinementCallback
+ self.refinementArgs = refinementArgs
+ self.forwardComparator = forwardComparator
+ if forwardComparator and not self.dissectGroundtruth:
+ raise ValueError("A comparator can only be forwarded to the refinement for evaluation if ground truth "
+ "(see CachedDistances.dissectGroundtruth) is available.")
+
+ def _callRefinement(self):
+ if self.refinementCallback is not None:
+ self.rawSegmentedMessages = self.segmentedMessages
+ if self.forwardComparator:
+ if isinstance(self.refinementArgs, dict):
+ self.refinementArgs["comparator"] = self.comparator
+ else:
+ self.refinementArgs = {"comparator": self.comparator}
+ if self.refinementCallback.__code__.co_argcount > 1: # not counting kwargs!
+ # assume the second argument is expected to be a distance calculator
+ chainedSegments = list(chain.from_iterable(self.segmentedMessages))
+ print("Refinement: Calculate distance for {} segments...".format(len(chainedSegments)))
+ if len(chainedSegments) ** 2 > MemmapDC.maxMemMatrix:
+ refinementDC = MemmapDC(chainedSegments)
+ else:
+ refinementDC = DelegatingDC(chainedSegments)
+ self.segmentedMessages = self.refinementCallback(self.segmentedMessages, refinementDC,
+ **self.refinementArgs)
+ else:
+ self.segmentedMessages = self.refinementCallback(self.segmentedMessages, **self.refinementArgs)
+
+ def _calc(self):
+ """
+ dissect and label messages
+ """
+ print("Load messages from {}...".format(self.pcapName))
+ self.specimens = SpecimenLoader(self.pcapfilename, self.layer, relativeToIP=self.relativeToIP)
+ if self.dissectGroundtruth:
+ self.comparator = MessageComparator(
+ self.specimens, layer=self.layer, relativeToIP=self.relativeToIP, debug=self.debug)
+ else:
+ self.comparator = BaseComparator(
+ self.specimens, layer=self.layer, relativeToIP=self.relativeToIP, debug=self.debug)
print("Segmenting messages...", end=' ')
segmentationTime = time.time()
# select tokenizer by command line parameter
- if tokenizer == "tshark":
- # 1. segment messages according to true fields from the labels
- segmentedMessages = annotateFieldTypes(analyzerType, analysisArgs, comparator)
- elif tokenizer == "4bytesfixed":
+ if self.tokenizer == "tshark":
+ if isinstance(self.comparator, MessageComparator):
+ # 1. segment messages according to true fields from the labels
+ self.segmentedMessages = annotateFieldTypes(self.analyzerType, self.analysisArgs, self.comparator)
+ else:
+ raise ValueError("tshark tokenizer can only be used with existing (Wireshark) dissector "
+ "and CachedDistances.dissectGroundtruth set to True.")
+ elif self.tokenizer == "4bytesfixed":
# 2. segment messages into fixed size chunks for testing
- segmentedMessages = segmentsFixed(4, comparator, analyzerType, analysisArgs)
- elif tokenizer == "nemesys":
+ self.segmentedMessages = fixedlengthSegmenter(4, self.specimens, self.analyzerType, self.analysisArgs)
+ elif self.tokenizer in ["nemesys", "nemesysle"]:
# 3. segment messages by NEMESYS
- segmentsPerMsg = bcDeltaGaussMessageSegmentation(specimens, sigma)
+ segmentsPerMsg = bcDeltaGaussMessageSegmentation(self.specimens, self.sigma)
# get analyzer requested by analyzerType/analysisArgs
- segmentedMessages = [[
- MessageSegment(MessageAnalyzer.findExistingAnalysis(
- analyzerType, MessageAnalyzer.U_BYTE, seg.message, analysisArgs), seg.offset, seg.length)
- for seg in msg] for msg in segmentsPerMsg]
-
- if refinementCallback is not None:
- if refinementCallback.__code__.co_argcount > 1:
- # assume the second argument is expected to be a distance calculator
- chainedSegments = list(chain.from_iterable(segmentedMessages))
- print("Refinement: Calculate distance for {} segments...".format(len(chainedSegments)))
- if len(chainedSegments)**2 > MemmapDC.maxMemMatrix:
- refinementDC = MemmapDC(chainedSegments)
- else:
- refinementDC = DelegatingDC(chainedSegments)
- segmentedMessages = refinementCallback(segmentedMessages, refinementDC)
- else:
- segmentedMessages = refinementCallback(segmentedMessages)
-
- # segments = list(chain.from_iterable(segmentedMessages))
+ self.segmentedMessages = MessageAnalyzer.convertAnalyzers(
+ segmentsPerMsg, self.analyzerType, self.analysisArgs)
+ self._callRefinement()
- segmentationTime = time.time() - segmentationTime
+ self.segmentationTime = time.time() - segmentationTime
print("done.")
- if filterTrivial:
- # noinspection PyUnboundLocalVariable
- chainedSegments = [seg for seg in chain.from_iterable(segmentedMessages) if
+ if self.filter:
+ chainedSegments = [seg for seg in chain.from_iterable(self.segmentedMessages) if
seg.length > 1 and set(seg.values) != {0}]
else:
- # noinspection PyUnboundLocalVariable
- chainedSegments = list(chain.from_iterable(segmentedMessages))
+ chainedSegments = list(chain.from_iterable(self.segmentedMessages))
print("Calculate distance for {} segments...".format(len(chainedSegments)))
# dc = DistanceCalculator(chainedSegments, reliefFactor=0.33) # Pairwise similarity of segments: dc.distanceMatrix
dist_calc_segmentsTime = time.time()
if len(chainedSegments) ** 2 > MemmapDC.maxMemMatrix:
- dc = MemmapDC(chainedSegments)
+ self.dc = MemmapDC(chainedSegments)
else:
- dc = DelegatingDC(chainedSegments)
- assert chainedSegments == dc.rawSegments
- dist_calc_segmentsTime = time.time() - dist_calc_segmentsTime
+ self.dc = DelegatingDC(chainedSegments)
+ self.dist_calc_segmentsTime = time.time() - dist_calc_segmentsTime
try:
- with open(dccachefn, 'wb') as f:
- pickle.dump((segmentedMessages, comparator, dc), f, pickle.HIGHEST_PROTOCOL)
- except MemoryError as e:
- print("DC could not be cached due to a MemoryError. Removing", dccachefn, "and continuing.")
- os.remove(dccachefn)
- else:
- print("Load distances from cache file {}".format(dccachefn))
- with open(dccachefn, 'rb') as f:
- segmentedMessages, comparator, dc = pickle.load(f)
- if not (isinstance(comparator, MessageComparator)
- and isinstance(dc, DistanceCalculator)):
+ with open(self.dccachefn, 'wb') as f:
+ pickle.dump((self.segmentedMessages, self.comparator, self.dc), f, pickle.HIGHEST_PROTOCOL)
+ print("Write distances to cache file {}".format(self.dccachefn))
+ except MemoryError:
+ print("DC could not be cached due to a MemoryError. Removing", self.dccachefn, "and continuing.")
+ os.remove(self.dccachefn)
+
+ def _load(self):
+ print("Load distances from cache file {}".format(self.dccachefn))
+ with open(self.dccachefn, 'rb') as f:
+ self.segmentedMessages, self.comparator, self.dc = pickle.load(f)
+ if not (isinstance(self.comparator, BaseComparator)
+ and isinstance(self.dc, DistanceCalculator)):
print('Loading of cached distances failed.')
exit(10)
- specimens = comparator.specimens
- # chainedSegments = list(chain.from_iterable(segmentedMessages))
- segmentationTime, dist_calc_segmentsTime = None, None
+ if not isinstance(self.comparator, MessageComparator):
+ print("Loaded without ground truth from dissector.")
+ self.specimens = self.comparator.specimens
+ self.segmentationTime, self.dist_calc_segmentsTime = None, None
+ self.isLoaded = True
+
+ def get(self):
+ assert self.analysisTitle is not None
+ assert self.tokenizer is not None
+ assert self.pcapName is not None
+ assert self.tokenizer[:7] != "nemesys" or self.sigma is not None
+
+ tokenparm = self.tokenizer if self.tokenizer[:7] != "nemesys" else \
+ "{}{:.0f}".format(self.tokenizer, self.sigma * 10)
+ refine = (self.refinementCallback.__name__ if self.refinementCallback is not None else "raw") \
+ + ("le" if self.refinementArgs is not None
+ and "littleEndian" in self.refinementArgs and self.refinementArgs["littleEndian"] else "")
+ fnprefix = "cache-dc"
+ if not self.dissectGroundtruth: # "-nogt" marks caches built without ground truth; self.comparator is not yet set here
+ fnprefix += "-nogt"
+ self.dccachefn = os.path.join(cacheFolder, '{}-{}-{}-{}-{}-{}-{}.{}'.format(
+ fnprefix, self.analysisTitle, tokenparm, "filtered" if self.filter else "all",
+ refine,
+ self.pcapName,
+ "" if self.layer == 2 and self.relativeToIP == True
+ else str(self.layer) + "reltoIP" if self.relativeToIP else "",
+ 'ddc'))
+ if self.disableCache or not os.path.exists(self.dccachefn):
+ self._calc()
+ else:
+ self._load()
+
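A hypothetical usage sketch of the new CachedDistances class; the analysisTitle "value", the trace path, and the sigma value are assumptions for illustration, not prescribed by the module:

```python
fromCache = CachedDistances("../input/ntp_SMIA-20111010_deduped-100.pcap", "value")
fromCache.configureTokenizer("nemesys", sigma=0.9)
fromCache.configureRefinement(refinements)
fromCache.get()  # loads from cache if present, otherwise segments and calculates distances

specimens, comparator = fromCache.specimens, fromCache.comparator
segmentedMessages, dc = fromCache.segmentedMessages, fromCache.dc
```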
+def cacheAndLoadDC(pcapfilename: str, analysisTitle: str, tokenizer: str, debug: bool,
+ analyzerType: type, analysisArgs: Tuple=None, sigma: float=None, filtering=False,
+ refinementCallback:Union[Callable, None] = refinements,
+ disableCache=False, layer=2, relativeToIP=True) \
+ -> Tuple[SpecimenLoader, BaseComparator, List[Tuple[MessageSegment]], DistanceCalculator, Optional[float],
+ Optional[float]]:
+ """
+ Legacy:
+ Wrapper around class CachedDistances for backwards compatibility:
+ cache or load the DistanceCalculator to or from the filesystem
+
+ >>> dc = DistanceCalculator()
+ >>> chainedSegments = dc.rawSegments
+
+ :param analyzerType: Unused
+ :param filtering: Filter out **one-byte** segments and those consisting only of **zeros**.
+ :param disableCache: When experimenting with distances manipulation, deactivate caching!
+ :return:
+ """
+ fromCache = CachedDistances(pcapfilename, analysisTitle, layer, relativeToIP)
+ fromCache.disableCache = disableCache
+ fromCache.debug = debug
+    if analysisArgs is not None:
+        fromCache.configureAnalysis(*analysisArgs)
+ fromCache.configureTokenizer(tokenizer, sigma, filtering)
+ fromCache.configureRefinement(refinementCallback)
+ fromCache.get()
+
+ assert fromCache.specimens is not None
+ assert fromCache.comparator is not None
+ assert fromCache.segmentedMessages is not None
+ assert fromCache.dc is not None
+
+ return fromCache.specimens, fromCache.comparator, fromCache.segmentedMessages, fromCache.dc, \
+ fromCache.segmentationTime, fromCache.dist_calc_segmentsTime
+
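+# Minimal usage sketch of the CachedDistances workflow that replaces this wrapper
+# (file name, analysis title, and parameter values are made-up examples):
+#
+#   fromCache = CachedDistances("input/example.pcap", "value", layer=2, relativeToIP=True)
+#   fromCache.configureTokenizer("nemesys", 0.9, False)
+#   fromCache.configureRefinement(refinements)
+#   fromCache.get()
+#   dc = fromCache.dc  # the ready-to-use DistanceCalculator, cached on disk for later runs
+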
- return specimens, comparator, segmentedMessages, dc, segmentationTime, dist_calc_segmentsTime
def resolveTemplates2Segments(segments: Iterable[AbstractSegment]):
@@ -711,3 +576,275 @@ def resolveTemplates2Segments(segments: Iterable[AbstractSegment]):
+class StartupFilecheck(object):
+ def __init__(self, pcapfilename: str, reportFullPath: str=None):
+ if not isfile(pcapfilename):
+ print('File not found:', pcapfilename)
+ exit(1)
+ self.pcapfilename = pcapfilename
+ self.pcapbasename = basename(pcapfilename)
+ self.pcapstrippedname = splitext(self.pcapbasename)[0]
+ print("\n\nTrace:", self.pcapbasename)
+
+ if reportFullPath is None:
+ self.reportFullPath = join(reportFolder, self.pcapstrippedname)
+ """A path name that is inside the report folder and reflects the pcap base name without extension."""
+ else:
+ self.reportFullPath = reportFullPath
+ """A path name that is inside the report folder and reflects the pcap base name without extension."""
+ if not exists(self.reportFullPath):
+ os.makedirs(self.reportFullPath)
+ elif isdir(self.reportFullPath):
+ print("Using existing ", self.reportFullPath, " as report folder.")
+ else:
+ print("Path that should be used as report folder is an existing file. Aborting.")
+ exit(1)
+
+ self.timestamp = time.time()
+ self.timeformated = time.strftime("%Y%m%d-%H%M%S", time.gmtime(self.timestamp))
+
+ def reportWithTimestamp(self, inferenceTitle: str=None):
+ if inferenceTitle is None:
+ reportPathTS = join(self.reportFullPath, self.timeformated)
+ else:
+ reportPathTS = join(self.reportFullPath, "{}_{}".format(inferenceTitle, self.timeformated))
+ os.makedirs(reportPathTS, exist_ok=True)
+ return reportPathTS
+
+ def writeReportMetadata(self, dcCacheFile: str=None, scriptRuntime: float=None):
+ import sys, git
+ if not exists(self.reportFullPath):
+            raise FileNotFoundError("The report folder must exist, but it does not.")
+ repo = git.Repo(search_parent_directories=True)
+ timeformat = "%d.%m.%Y %H:%M:%S %Z"
+
+ lines = {
+ "fullCommandLine": " ".join(sys.argv),
+ "absolutepcapfilename": abspath(self.pcapfilename),
+ "dcCacheFile": "n/a" if dcCacheFile is None else dcCacheFile,
+ "gitCommit": repo.head.object.hexsha + f" ({repo.active_branch})",
+ "currentTime": time.strftime(timeformat),
+ "scriptRuntime": "{:.3f} s".format(time.time() - self.timestamp
+ if scriptRuntime is None else scriptRuntime),
+ "host": os.uname().nodename
+ }
+
+ with open(join(self.reportFullPath, "run-metadata.md"), "a") as md:
+ md.write("# Report Metadata\n\n")
+ md.writelines(f"{k}: {v}\n\n" for k, v in lines.items())
+
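+# Hypothetical usage: check the input trace and create a per-run report directory.
+#   filechecker = StartupFilecheck("input/example.pcap")
+#   outdir = filechecker.reportWithTimestamp("nemesys-run")
+#   # e.g. reports/example/nemesys-run_20210101-120000
+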
+
+class TrueOverlays(object):
+ """
+    Count the number of (falsely) inferred boundaries within the scope of each true field.
+ """
+ def __init__(self, trueSegments: Dict[str, Sequence[MessageSegment]],
+ inferredSegments: List[Sequence[MessageSegment]], comparator: MessageComparator, minLen=3):
+ self.trueSegments = trueSegments
+ self.comparator = comparator
+ self.inferredSegments = inferredSegments
+ # at least minLen bytes long
+ self.keys4longer = [k for k, segs in self.trueSegments.items() if any(len(s) >= minLen for s in segs)]
+ self.trueNamesOverlay = defaultdict(defaultdict)
+ self._classifyTrueNamesOverlays()
+ self.trueNamesOverlayCounters = self._trueOverlayCounters()
+ # maxoverlaysegcount = max(cnt.keys() for cnt in self.trueNamesOverlayCounters.values())
+
+ def _classifyTrueNamesOverlays(self):
+ # sort the inferred segments per true segment by type of overlapping
+ for k4l in self.keys4longer:
+ for seg in self.trueSegments[k4l]:
+ inf4msg = inferred4segment(seg, self.inferredSegments)
+ # inferred field overlapping
+ overlapping = [i4m for i4m in inf4msg if isOverlapping(seg, i4m)]
+ # true and inferred fields match exactly
+ # .. t .. t ..
+ # .. i .. i ..
+ if len(overlapping) == 1 and \
+ seg.offset == overlapping[0].offset and seg.nextOffset == overlapping[0].nextOffset:
+ self.trueNamesOverlay[k4l][seg] = 0
+ # overspecific inference: true field overlaps by multiple inferred segments
+ # .. t ......... t ..
+ # .. i .. i .. i ..
+ elif len(overlapping) > 1:
+ self.trueNamesOverlay[k4l][seg] = len(overlapping)
+ # underspecific inference: true field is only substring of an inferred
+ # .. t .. t ..
+ # .. i ......... i ..
+ else: # len(overlapping) == 1 and (seg.offset < overlapping[0].offset or seg.nextOffset > overlapping[0].nextOffset)
+ self.trueNamesOverlay[k4l][seg] = -1
+
+ def _trueOverlayCounters(self):
+ # amount of (falsely) inferred boundaries in the scope of each true field per true field name.
+ return {fname: Counter(segcnt.values()) for fname, segcnt in
+ self.trueNamesOverlay.items()} # type: Dict[str, Counter]
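+    # e.g., {"dns.qry.name": Counter({0: 40, -1: 3, 2: 5})} would mean: for this field name,
+    # 40 exact matches, 3 underspecific inferences, and 5 true fields split into two inferred segments.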
+
+ _cutoff = 10
+ _cntheaders = ["Field name / inf per true", "min len", "max len", "sum", "all nulls",
+ "underspecific", "exact"] \
+ + list(range(2, _cutoff)) + [f"> {_cutoff-1} (segments too many)"]
+
+ def _cnttable(self):
+ return [[fname,
+ min(len(s) for s in self.trueSegments[fname]), max(len(s) for s in self.trueSegments[fname]),
+ len(self.trueSegments[fname]),
+ sum(set(s.values) == {0} for s in self.trueSegments[fname])
+ ] + [
+ cnt[c] if c in cnt else None for c in [-1, 0] + list(range(2, TrueOverlays._cutoff))
+ ] + [sum(c for k, c in cnt.items() if k >= TrueOverlays._cutoff)]
+ for fname, cnt in self.trueNamesOverlayCounters.items()]
+
+ def __repr__(self):
+ """
+ Arrange the data of trueNamesOverlayCounters in a table like this:
+
+        # Field name / inf per true | min len | max len | sum | all nulls | underspecific | exact | 2 | 3 | ... | > 9 (segments too many)
+        # wlan.fixed.ftm.param.delim2 | ... |   50 | 21 | ...
+        # wlan.tag.oui | ... |  |  3 | ...
+        # wlan.fixed.ftm_toa | ...
+
+ :return: Visual table
+ """
+ cnttable = self._cnttable()
+ return tabulate(cnttable, headers=TrueOverlays._cntheaders)
+
+ def toCSV(self, folder: str):
+ import csv
+ csvPath = join(folder, type(self).__name__ + ".csv")
+ if exists(csvPath):
+ raise FileExistsError("Will not overwrite existing file " + csvPath)
+ with open(csvPath, 'w') as csvFile:
+ cntcsv = csv.writer(csvFile) # type: csv.writer
+ cntcsv.writerow(self._cntheaders)
+ cntcsv.writerows(self._cnttable())
+
+ @staticmethod
+ def uniqSort(someSegs: Dict[str, Sequence[MessageSegment]]):
+ # remove double values by adding into dicts
+ uniqSegs = {fname: {s.values: s for s in segs} for fname, segs in someSegs.items()}
+ sortedSegs = {fname: sorted(segs.values(), key=lambda s: s.values) for fname, segs in uniqSegs.items()}
+ return sortedSegs
+
+ def filterUnderspecific(self):
+ """
+        True segments that are not all nulls and were inferred underspecifically, sorted by segment value.
+ :return:
+ """
+ filteredSegs = {fname: (s for s, c in segolc.items() if set(s.values) != {0} and c < 0)
+ for fname, segolc in self.trueNamesOverlay.items()}
+ return TrueOverlays.uniqSort(filteredSegs)
+
+ def filterOverspecific(self, segCnt: int=3):
+        # true segments that are not all nulls and are split into exactly segCnt inferred segments (overspecific)
+ filteredSegs = {fname: (s for s, c in segolc.items() if set(s.values) != {0} and c == segCnt)
+ for fname, segolc in self.trueNamesOverlay.items()}
+ return TrueOverlays.uniqSort(filteredSegs)
+
+ def printSegmentContexts(self, trueSegments: Dict[str, Sequence[MessageSegment]], maxlines=10):
+ """
+ print the selected fields for reference
+ :param trueSegments:
+ :param maxlines: Limit output per field category to this number, no limit if <= 0
+ """
+ for lab, segs in trueSegments.items():
+ if len(segs) > 0:
+ print("\n" "# #", lab)
+ if maxlines > 0:
+ truncatedSegs = segs[:maxlines]
+ else:
+ truncatedSegs = segs
+ markSegNearMatch(truncatedSegs, self.inferredSegments, self.comparator, 3)
+ # for seg in segs:
+ # # inf4msg = inferred4segment(seg, self.inferredSegments)
+ # # overlapping = [i4m for i4m in inf4msg if isOverlapping(seg, i4m)]
+ # # # print(overlapping)
+ # # if len(overlapping) == 1:
+ # # # print("match or inferred larger. continuing")
+ # # continue
+ # markSegNearMatch(seg, self.inferredSegments, 3)
+
+
+class TrueDataTypeOverlays(TrueOverlays):
+ def __init__(self, trueSegmentedMessages: Dict[AbstractMessage, Tuple[TypedSegment]],
+ inferredSegments: List[Sequence[MessageSegment]], comparator: MessageComparator, minLen: int = 3):
+ # all true fields of one data type
+ trueDataTypes = defaultdict(list)
+ for seg in (seg for msgsegs in trueSegmentedMessages.values() for seg in msgsegs):
+ trueDataTypes[seg.fieldtype].append(seg)
+ super().__init__(trueDataTypes, inferredSegments, comparator, minLen)
+
+
+class TrueFieldNameOverlays(TrueOverlays):
+ def __init__(self, trueSegmentedMessages: Dict[AbstractMessage, Tuple[TypedSegment]],
+ inferredSegments: List[Sequence[MessageSegment]], comparator: MessageComparator, minLen: int = 3):
+ # all true fields of one field type (tshark name)
+ trueFieldNames = defaultdict(list)
+ for absmsg, msgsegs in trueSegmentedMessages.items():
+ pm = comparator.parsedMessages[comparator.specimens.messagePool[absmsg]]
+ fnames = pm.getFieldNames()
+            # here we assume that the fnames and msgsegs are in the same order (and are equal in number),
+ # which should be the case if ParsedMessage works correctly and trueSegmentedMessages was not tampered with.
+ assert len(msgsegs) == len(fnames)
+ for seg, fna in zip(msgsegs, fnames):
+ trueFieldNames[fna].append(seg)
+ super().__init__(trueFieldNames, inferredSegments, comparator, minLen)
+
+
+class TitleBuilder(object):
+ """Builds readable strings from the configuration parameters of the analysis."""
+ def __init__(self, tokenizer, refinement = None, sigma = None, clusterer = None):
+ self.tokenizer = tokenizer
+ self.refinement = refinement
+ self._clusterer = clusterer
+        self.postProcess = None  # may take multiple values; adjusted dynamically during the analysis
+
+ self.sigma = sigma
+
+ @property
+ def tokenParams(self):
+ return f"sigma {self.sigma}" if self.tokenizer[:7] == "nemesys" else None
+
+ @property
+ def clusterer(self):
+ return type(self._clusterer).__name__
+
+ @clusterer.setter
+ def clusterer(self, val):
+ self._clusterer = val
+
+ @property
+ def clusterParams(self):
+ from sklearn.cluster import DBSCAN
+ from hdbscan import HDBSCAN
+        if isinstance(self._clusterer, DBSCAN):
+            return f"eps {self._clusterer.eps:.3f} ms {self._clusterer.min_samples}"
+        elif isinstance(self._clusterer, HDBSCAN):
+ return f"mcs {self._clusterer.min_cluster_size} ms {self._clusterer.min_samples}"
+
+ @property
+ def plotTitle(self):
+ plotTitle = self.tokenizer
+ if self.tokenParams is not None: plotTitle += "-" + self.tokenParams
+ if self.refinement is not None: plotTitle += "-" + self.refinement
+ plotTitle += " " + self.clusterer + " " + self.clusterParams
+ if self.postProcess is not None: plotTitle += " " + self.postProcess
+ return plotTitle
+
+ @property
+ def dict(self):
+ return {
+ "tokenizer": self.tokenizer,
+ "tokenParams": self.tokenParams,
+ "refinement": self.refinement,
+ "clusterer": self.clusterer,
+ "clusterParams": self.clusterParams,
+ "postProcess": self.postProcess
+ }
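+
+    # Hypothetical usage (the clusterer instance and parameter values are made up):
+    #   title = TitleBuilder("nemesys", refinement="base", sigma=1.2,
+    #                        clusterer=DBSCAN(eps=0.2, min_samples=3))
+    #   title.plotTitle  # "nemesys-sigma 1.2-base DBSCAN eps 0.200 ms 3"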
+
+
+class TitleBuilderSens(TitleBuilder):
+ """include Sensitivitiy from clusterer in title"""
+ @property
+ def clusterParams(self):
+ return super().clusterParams
+
diff --git a/src/nemere/utils/loader.py b/src/nemere/utils/loader.py
index 8a227ebd..36f8f1b2 100644
--- a/src/nemere/utils/loader.py
+++ b/src/nemere/utils/loader.py
@@ -2,9 +2,17 @@
from os.path import isfile
from collections import OrderedDict
+from scapy.layers.dot11 import RadioTap, Dot11, Dot11FCS
+from scapy.packet import Packet, Raw
+from scapy.utils import rdpcap
+import pcapy
+
+from netzob.Common.NetzobException import NetzobImportException
+from netzob.Common.Utils.SortedTypedList import SortedTypedList
from netzob.Import.PCAPImporter.PCAPImporter import PCAPImporter
from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage
from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage
+from netzob.Model.Vocabulary.Messages.L2NetworkMessage import L2NetworkMessage
from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage
from nemere.validation.messageParser import ParsingConstants
@@ -16,12 +24,13 @@ class BaseLoader(object):
Especially useful for on-the-fly creation of test cases.
"""
- def __init__(self, l5msgs, l1msgs=None):
+ def __init__(self, l5msgs, l1msgs=None, baselayer=None):
"""
Load messages from memory. Base class for other loaders, e. g. loading from PCAP file.
:param l5msgs:
:param l1msgs:
+ :param baselayer: Optionally, set the base layer explicitly.
"""
if not hasattr(self, 'pcapFileName'):
self.pcapFileName = 'from-memory'
@@ -35,28 +44,40 @@ def __init__(self, l5msgs, l1msgs=None):
# TODO replace the above quickfix not to read the file a second time (should we?)
# probably we could use msgs = ParsedMessage.parseMultiple(l1msgs); for m in msgs:
# ... append(RawMessage(m.protocolbytes))
+ self._baselayer = baselayer
def getBaseLayerOfPCAP(self):
"""
see ParsingConstants.LINKTYPES
- :return: Determine lowest encapulating layer of PCAP.
+ :return: Determine lowest encapsulating layer of PCAP.
"""
+ if self._baselayer is not None:
+ return self._baselayer
+
try:
- # looking at just one message should reveal lowest encapulating layer of the whole PCAP
+ # looking at just one message should reveal lowest encapsulating layer of the whole PCAP
al5msg = next(iter(self.messagePool.keys()))
except StopIteration:
raise ValueError('No message could be imported. See previous errors for more details.')
- if al5msg.l2Protocol == 'Ethernet':
- return ParsingConstants.LINKTYPES['ETHERNET']
- elif al5msg.l2Protocol == 'None': # no ethernet
- if al5msg.l3Protocol == 'IP':
- return ParsingConstants.LINKTYPES['RAW_IP'] # IP
+ if isinstance(al5msg, L2NetworkMessage):
+ if al5msg.l2Protocol == 'Ethernet':
+ return ParsingConstants.LINKTYPES['ETHERNET']
+            # for some reason, it's not possible to access the class variable "name", so instantiate a dummy object
+ elif al5msg.l2Protocol == Dot11().name or al5msg.l2Protocol == Dot11FCS().name:
+ return ParsingConstants.LINKTYPES['IEEE802_11'] # 802.11
+ elif al5msg.l2Protocol == 'None': # no ethernet
+ if al5msg.l3Protocol == 'IP':
+ return ParsingConstants.LINKTYPES['RAW_IP'] # IP
+ else:
+ raise NotImplementedError("Linktype on layer 3 unknown. Protocol is {}".format(al5msg.l3Protocol))
else:
- raise NotImplementedError("Linktype on layer 3 unknown. Protocol is {}".format(al5msg.l3Protocol))
+ raise NotImplementedError("Linktype on layer 2 unknown. Protocol is {}".format(al5msg.l2Protocol))
else:
- raise NotImplementedError("Linktype on layer 2 unknown. Protocol is {}".format(al5msg.l2Protocol))
+ return ParsingConstants.LINKTYPES['undecoded'] # non-decoded raw trace without link type information
+ def __repr__(self):
+ return type(self).__name__ + ": " + self.pcapFileName + f" on layer {self.getBaseLayerOfPCAP()}"
class SpecimenLoader(BaseLoader):
"""
@@ -84,18 +105,36 @@ def __init__(self, pcap: str, layer:int=-1, relativeToIP:bool=False):
if not isfile(pcap):
raise FileNotFoundError('File not found:', pcap)
self.pcapFileName = pcap
+ absLayer = 2 + layer if relativeToIP else layer
- if layer < 0:
- # read messages at layer 5 for the Netzob inference
- l5msgs = PCAPImporter.readFile(pcap, importLayer=5).values() # type: List[L4NetworkMessage]
- else:
- # read messages at given layer for the Netzob inference
- absLayer = 2 + layer if relativeToIP else layer
- l5msgs = PCAPImporter.readFile(pcap, importLayer=absLayer).values() # type: List[AbstractMessage]
- # read messages as raw for tshark input
- l1msgs = PCAPImporter.readFile(pcap, importLayer=1).values() # type: List[RawMessage]
+ try:
+ if layer < 0:
+ # read messages at layer 5 for the Netzob inference
+ l5msgs = PCAPImporter.readFile(pcap, importLayer=5).values() # type: List[L4NetworkMessage]
+ else:
+ # read messages at given layer for the Netzob inference
+ l5msgs = PCAPImporter.readFile(pcap, importLayer=absLayer).values() # type: List[AbstractMessage]
+ # read messages as raw for tshark input
+ l1msgs = PCAPImporter.readFile(pcap, importLayer=1).values() # type: List[RawMessage]
+ except (NetzobImportException, pcapy.PcapError):
+ importer = ScaPyCAPimporter(self.pcapFileName, absLayer)
+ l5msgs = importer.messages
+ l1msgs = importer.rawMessages
super().__init__(l5msgs, l1msgs)
+ # The int value of some pcapy datalink denotations is different from the tcpdump ones: https://www.tcpdump.org/linktypes.html
+ # http://vpnb.leipzig.freifunk.net:8004/srv2/lede/lede-20171116/build_dir/target-mips_24kc_musl/python-pcapy-0.11.1/pcapy.html#idp8777598240
+ pcapyDatalinkTranslation = {
+ pcapy.DLT_RAW: ParsingConstants.LINKTYPES['RAW_IP']
+ }
+ """Translates pcapy linktype values to tcpdump ones."""
+
+ def getBaseLayerOfPCAP(self):
+ pcap = pcapy.open_offline(self.pcapFileName)
+ dl = pcap.datalink()
+ # Translates pcapy linktype values to tcpdump ones if in dict, otherwise the value is used unchanged
+ return dl if dl not in SpecimenLoader.pcapyDatalinkTranslation else SpecimenLoader.pcapyDatalinkTranslation[dl]
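+        # e.g., pcapy's DLT_RAW (a platform-dependent constant) is translated to the
+        # tcpdump LINKTYPE_RAW value expected by ParsingConstants.LINKTYPES['RAW_IP'].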
+
@property
def maximumMessageLength(self):
"""
@@ -105,3 +144,61 @@ def maximumMessageLength(self):
return max(len(line.data) for line in self.messagePool.keys())
+class ScaPyCAPimporter(object):
+ def __init__(self, pcapfilename, importLayer=5):
+ # l5msgs = PCAPImporter.readFile(pcap, importLayer=absLayer).values() # type: List[AbstractMessage]
+ self.importLayer = importLayer
+ self.packets = rdpcap(pcapfilename)
+ self._messages = SortedTypedList(AbstractMessage)
+ self._rawmessages = SortedTypedList(AbstractMessage)
+
+ for pkt in self.packets: # type: Packet
+ self.packetHandler(pkt)
+
+ @property
+ def messages(self):
+ return self._messages.values()
+
+ @property
+ def rawMessages(self):
+ return self._rawmessages.values()
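+
+    # Hypothetical fallback usage, mirroring the except branch in SpecimenLoader above:
+    #   importer = ScaPyCAPimporter("wlan-capture.pcap", importLayer=2)
+    #   l5msgs, l1msgs = importer.messages, importer.rawMessages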
+
+ def packetHandler(self, packet: Packet):
+ epoch = packet.time
+ l1Payload = bytes(packet)
+ if len(l1Payload) == 0:
+ return
+ # Build the RawMessage
+ rawMessage = RawMessage(l1Payload, epoch, source=None, destination=None)
+
+ if isinstance(packet, RadioTap):
+ # lift layer to Dot11 if there is a RadioTap dummy frame
+ packet = packet.payload
+ if self.importLayer == 2:
+ (l2Proto, l2SrcAddr, l2DstAddr, l2Payload) = self.__decodeLayer2(packet)
+ if len(l2Payload) == 0:
+ return
+ # Build the L2NetworkMessage
+ l2Message = L2NetworkMessage(l2Payload, epoch, l2Proto, l2SrcAddr, l2DstAddr)
+ self._messages.add(l2Message)
+ self._rawmessages.add(rawMessage)
+ else:
+            # Currently only import layer 2 is handled here; use Netzob's PCAPImporter for other layers
+ raise NetzobImportException("PCAP", "Unsupported import layer. Currently only handles layer 2.",
+ PCAPImporter.INVALID_LAYER2)
+
+ def __decodeLayer2(self, packet: Packet):
+ """Internal method that parses the specified header and extracts
+        layer2 related properties."""
+ l2Proto = packet.name
+ if isinstance(packet, Raw):
+ print("Ignoring undecoded packet with values:", bytes(packet).hex())
+ return l2Proto, None, None, ""
+ if isinstance(packet, Dot11):
+ l2DstAddr = packet.fields['addr1'] # receiver address, alt: packet.fields['addr3'] destination address
+ l2SrcAddr = packet.fields['addr2'] # transmitter address, alt: packet.fields['addr4'] source address
+ else:
+ raise NetzobImportException("NEMERE_PCAP", "Unsupported layer 2 protocol " + l2Proto,
+ PCAPImporter.INVALID_LAYER2)
+ l2Payload = bytes(packet.payload)
+ return l2Proto, l2SrcAddr, l2DstAddr, l2Payload
\ No newline at end of file
diff --git a/src/nemere/utils/reportWriter.py b/src/nemere/utils/reportWriter.py
new file mode 100644
index 00000000..1d678703
--- /dev/null
+++ b/src/nemere/utils/reportWriter.py
@@ -0,0 +1,662 @@
+"""
+Write Format Match Score report for a list of analysed messages.
+"""
+
+import os
+import csv
+from abc import ABC, abstractmethod
+
+import numpy
+from typing import Dict, Tuple, Iterable, TypeVar, Hashable, List, Union, Any, Sequence
+from os.path import isdir, splitext, basename, join
+from itertools import chain
+from collections import Counter, defaultdict, OrderedDict
+
+from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage
+
+from nemere.inference.segments import AbstractSegment, TypedSegment, MessageSegment
+from nemere.inference.templates import Template, TypedTemplate
+from nemere.utils.evaluationHelpers import StartupFilecheck, reportFolder
+from nemere.utils.loader import SpecimenLoader
+from nemere.validation.dissectorMatcher import FormatMatchScore, MessageComparator
+
+
+def calcScoreStats(scores: Iterable[float]) -> Tuple[float, float, float, float, float]:
+ """
+ :param scores: An Iterable of FMS values.
+    :return: min, meankey, max, median, and standard deviation of the scores,
+        where meankey is the value in scores closest to the mean of all scores,
+        and median is the statistical median of the scores.
+ """
+ scores = sorted(scores)
+ fmsmin, fmsmean, fmsmax, fmsmedian, fmsstd = \
+ numpy.min(scores), numpy.mean(scores), numpy.max(scores), numpy.median(scores), numpy.std(scores)
+ # get quality key closest to mean
+ fmsmeankey = 1
+ if len(scores) > 2:
+ for b,a in zip(scores[:-1], scores[1:]):
+ if a < fmsmean:
+ continue
+ fmsmeankey = b if fmsmean - b < a - fmsmean else a
+ break
+ return float(fmsmin), float(fmsmeankey), float(fmsmax), float(fmsmedian), float(fmsstd)
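+
+# Example with made-up scores: calcScoreStats([0.2, 0.5, 0.9]) returns
+# (0.2, 0.5, 0.9, 0.5, ~0.287), since 0.5 is the score closest to the mean (~0.533)
+# and also the median of the three values.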
+
+
+def getMinMeanMaxFMS(scores: Iterable[float]) -> Tuple[float, float, float]:
+ """
+ :param scores: An Iterable of FMS values.
+ :return: min, meankey, and max of the scores,
+        where meankey is the value in scores closest to the mean of its values.
+ """
+ return calcScoreStats(scores)[:3]
+
+
+def countMatches(quality: Iterable[FormatMatchScore]):
+ """
+ :param quality: List of FormatMatchScores
+ :return: count of exact matches, off-by-one near matches, off-by-more-than-one matches
+ """
+ exactcount = 0
+ offbyonecount = 0
+ offbymorecount = 0
+ for fms in quality: # type: FormatMatchScore
+ exactcount += fms.exactCount
+ offbyonecount += sum(1 for truf, inff in fms.nearMatches.items() if abs(truf - inff) == 1)
+ offbymorecount += sum(1 for truf, inff in fms.nearMatches.items() if abs(truf - inff) > 1)
+ return exactcount, offbyonecount, offbymorecount
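+
+# e.g., a true boundary at offset 12 matched with an inferred boundary at 13 counts
+# as off-by-one; matched with an inferred boundary at 15 it counts as off-by-more.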
+
+
+def writeReport(formatmatchmetrics: Dict[AbstractMessage, FormatMatchScore],
+ runtime: float,
+ specimens: SpecimenLoader, comparator: MessageComparator,
+ inferenceTitle: str, folder="reports"):
+
+ if not isdir(folder):
+ raise NotADirectoryError("The reports folder {} is not a directory. Reports cannot be written there.".format(
+ folder))
+ print('Write report to ' + folder)
+
+ # write Format Match Score and Metrics to csv
+ with open(os.path.join(folder, 'FormatMatchMetrics.csv'), 'w') as csvfile:
+ fmmcsv = csv.writer(csvfile)
+ fmmcsv.writerow(["Message", "Score", 'I', 'M', 'N', 'S', 'MG', 'SP'])
+ fmmcsv.writerows( [
+ (message.data.hex(), fms.score,
+ fms.inferredCount, fms.exactCount, fms.nearCount, fms.specificy, fms.matchGain, fms.specificyPenalty)
+ for message, fms in formatmatchmetrics.items()] )
+
+ scoreStats = calcScoreStats([q.score for q in formatmatchmetrics.values()])
+ matchCounts = countMatches(formatmatchmetrics.values())
+
+ with open(os.path.join(folder, 'ScoreStatistics.csv'), 'w') as csvfile:
+ fmmcsv = csv.writer(csvfile)
+ fmmcsv.writerow(["inference", "min", "mean", "max", "median", "std",
+ "exactcount", "offbyonecount", "offbymorecount", "runtime"])
+ fmmcsv.writerow( [ inferenceTitle,
+ *scoreStats, *matchCounts,
+ runtime] )
+
+ # write Symbols to csvs
+ multipleSymbolCSVs = False
+ if multipleSymbolCSVs:
+ for cnt, symbol in enumerate( # by the set comprehension,
+ { quality.symbol # remove identical symbols due to multiple formats
+ for quality
+ in formatmatchmetrics.values() } ):
+ fileNameS = 'Symbol_{:s}_{:d}'.format(symbol.name, cnt)
+ with open(os.path.join(folder, fileNameS + '.csv'), 'w') as csvfile:
+ symbolcsv = csv.writer(csvfile)
+ symbolcsv.writerow([field.name for field in symbol.fields])
+ symbolcsv.writerows([val.hex() for val in msg] for msg in symbol.getCells())
+ else:
+ fileNameS = 'Symbols'
+ with open(os.path.join(folder, fileNameS + '.csv'), 'w') as csvfile:
+ symbolcsv = csv.writer(csvfile)
+ msgcells = chain.from_iterable([sym.getCells() for sym in # unique symbols by set
+ {fms.symbol for fms in formatmatchmetrics.values()}])
+ symbolcsv.writerows(
+ [val.hex() for val in msg] for msg in msgcells
+ )
+
+ # # write tshark-dissection to csv
+ # # currently only unique formats. For a specific trace a baseline could be determined
+ # # by a one time run of per ParsedMessage
+ # with open(os.path.join(reportFolder, 'tshark-dissections.csv'), 'w') as csvfile:
+ # formatscsv = csv.writer(csvfile)
+ # revmsg = {l2m: l5m for l5m, l2m in specimens.messagePool.items()} # get L5 messages for the L2 in tformats
+ # formatscsv.writerows([(revmsg[l2m].data.hex(), f) for l2m, f in tformats.items()])
+
+
+    # FMS : Symbol (we just need one example per score, so overwriting multiple identical ones does not matter)
+ score2symbol = {fms.score: fms.symbol for fms in formatmatchmetrics.values()}
+
+ # symsMinMeanMax = [score2symbol[mmm] for mmm in scoreStats[:3]]
+ # add some context symbols
+ scoreSorted = sorted(score2symbol.keys())
+ meanI = scoreSorted.index(scoreStats[1])
+ scoreSelect = scoreSorted[-3:] + scoreSorted[meanI:meanI+3] + scoreSorted[:3]
+ symsMinMeanMax = [score2symbol[mmm] for mmm in scoreSelect]
+ tikzcode = comparator.tprintInterleaved(symsMinMeanMax)
+
+    # write TikZ code of example messages for min/mean/max FMS
+ with open(join(folder, 'example-inference-minmeanmax.tikz'), 'w') as tikzfile:
+ tikzfile.write(tikzcode)
+
+
+def writeSegmentedMessages2CSV(segmentsPerMsg: Sequence[Sequence[MessageSegment]], folder="reports"):
+ import csv
+ fileNameS = 'SegmentedMessages'
+ with open(os.path.join(folder, fileNameS + '.csv'), 'w') as csvfile:
+ symbolcsv = csv.writer(csvfile)
+ symbolcsv.writerows(
+ [seg.bytes.hex() for seg in msg] for msg in segmentsPerMsg
+ )
+
+
+Element = TypeVar('Element', AbstractMessage, AbstractSegment)
+class Report(ABC):
+ statsFile = "statistics"
+
+ def __init__(self, groundtruth, pcap: Union[str, StartupFilecheck], reportPath: str=None):
+ """
+ :param groundtruth: Lookup for Segment : true type string
+ :param pcap: Reference to the PCAP file to report for.
+ """
+ self.groundtruth = groundtruth
+ self.pcap = pcap
+ self.reportPath = reportPath if reportPath is not None else reportFolder
+ if not isdir(self.reportPath):
+ raise FileNotFoundError(f"The report folder {self.reportPath} needs to exist. It does not. Aborting.")
+ self.runtitle = None
+
+ @abstractmethod
+ def write(self, inference, runtitle: Union[str, Dict]):
+ raise NotImplementedError()
+
+class ClusteringReport(Report, ABC):
+ """
+ Calculate conciseness, correctness = precision, and recall for the given clusters compared to some groundtruth.
+ Applicable to clusters of AbstractMessage or AbstractSegment elements.
+ """
+ messagetypeStatsFile = None
+ segmenttypeStatsFile = None
+
+ @abstractmethod
+ def write(self, clusters: Dict[Hashable, List[Element]], runtitle: Union[str, Dict]):
+ """
+ :param clusters: clusterlabel : List of Segments (not Templates!)
+ :param runtitle: Label to identify the inference run with, e. g.
+ "{}-{}-eps={:.2f}-min_samples={}-split".format(tokenizer,
+ type(clusterer).__name__, clusterer.eps, clusterer.min_samples)
+ """
+ raise NotImplementedError()
+
+ @abstractmethod
+ def _writeCSV(self, runtitle: Union[str, Dict]):
+ """
+ :param runtitle: Label to identify the inference run with, e. g.
+ "{}-{}-eps={:.2f}-min_samples={}-split".format(tokenizer,
+ type(clusterer).__name__, clusterer.eps, clusterer.min_samples)
+ """
+ raise NotImplementedError()
+ # TODO in implementing subclasses: add sigma, refinement type,
+ # TODO split runtitle into columns for tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples
+
+ def _printMessage(self, outfile: str):
+ """Print a user notification about whats happening."""
+ print('Write {} cluster statistics to {}...'.format(
+ "message" if self.statsFile == type(self).messagetypeStatsFile else "segment",
+ outfile))
+
+ @staticmethod
+ def inferenceColumns(inferenceParams: Dict[str, str]):
+ infCols = OrderedDict()
+ infCols["tokenrefine"] = inferenceParams["tokenizer"]
+ if inferenceParams["tokenParams"] is not None: infCols["tokenrefine"] += "-" + inferenceParams["tokenParams"]
+ if inferenceParams["refinement"] is not None: infCols["tokenrefine"] += "-" + inferenceParams["refinement"]
+ infCols["clustering"] = inferenceParams["clusterer"] + "-" + inferenceParams["clusterParams"]
+ infCols["postProcess"] = inferenceParams["postProcess"] if inferenceParams["postProcess"] is not None else ""
+ return infCols
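+    # Example (made-up parameters): {"tokenizer": "nemesys", "tokenParams": "sigma 1.2",
+    #  "refinement": "base", "clusterer": "DBSCAN", "clusterParams": "eps 0.200 ms 3",
+    #  "postProcess": None} becomes {"tokenrefine": "nemesys-sigma 1.2-base",
+    #  "clustering": "DBSCAN-eps 0.200 ms 3", "postProcess": ""}.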
+
+class IndividualClusterReport(ClusteringReport):
+ """from writeIndividualClusterStatistics"""
+ messagetypeStatsFile = "messagetype-cluster-statistics"
+ segmenttypeStatsFile = "segment-cluster-statistics"
+
+ def __init__(self, groundtruth: Dict[Element, str], pcap: Union[str, StartupFilecheck]):
+ super().__init__(groundtruth, pcap)
+ # set filename for CSV depending on element type (message or segment)
+ ClusteringReport.statsFile = IndividualClusterReport.messagetypeStatsFile \
+ if isinstance(next(iter(groundtruth.keys())), AbstractMessage) \
+ else IndividualClusterReport.segmenttypeStatsFile
+ self.hasNoise = False
+ self.noiseTypes, self.ratioNoise, self.numNoise = [None] * 3
+ self.conciseness, self.precisionRecallList = [None] * 2
+ self._additionalColumns = OrderedDict() # type: Dict[str, Dict[Hashable, Any]]
+
+ def addColumn(self, colData: Dict[Hashable, Any], header: str):
+ """add data to a new column. colData contains the cluster label (or "Noise") to determine the row.
+ The order of the columns in the table is the same as they were added here."""
+ self._additionalColumns[header] = colData
+
+ def write(self, clusters: Dict[Hashable, List[Element]], runtitle: Union[str, Dict]):
+ numSegs = 0
+ prList = []
+
+ # handle noise
+ noise = None
+ noisekey = 'Noise' if 'Noise' in clusters else -1 if -1 in clusters else None
+        if noisekey is not None:
+ self.hasNoise = True
+ prList.append(None)
+ noise = clusters[noisekey]
+ clusters = {k: v for k, v in clusters.items() if k != noisekey} # remove the noise
+
+ # cluster statistics
+ numClusters = len(clusters)
+ numTypesOverall = Counter(self.groundtruth.values())
+ numTypes = len(numTypesOverall)
+ self.conciseness = numClusters / numTypes
+ for label, cluster in clusters.items():
+ # we assume correct Tuples of MessageSegments with all objects in one Tuple originating from the same message
+ typeFrequency = Counter([self.groundtruth[element] for element in cluster])
+            mostFrequentType, numMFTinCluster = typeFrequency.most_common(1)[0]
+            numSegsInCluster = len(cluster)
+            numSegs += numSegsInCluster
+
+            precision = numMFTinCluster / numSegsInCluster
+            recall = numMFTinCluster / numTypesOverall[mostFrequentType]
+
+            prList.append((label, mostFrequentType, precision, recall, numSegsInCluster))
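+            # e.g., a cluster of 10 elements whose most frequent true type covers 8 of them,
+            # with 16 elements of that type overall: precision = 8/10, recall = 8/16.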
+ self.precisionRecallList = prList
+
+ # noise statistics
+ if noise:
+ self.numNoise = len(noise)
+ numSegs += self.numNoise
+ self.ratioNoise = self.numNoise / numSegs
+ self.noiseTypes = {self.groundtruth[element] for element in noise}
+
+ self._writeCSV(runtitle)
+
+ def _writeCSV(self, runtitle: Union[str, Dict]):
+ """Add the report to the appropriate CSV. Appends rows, if the CSV already exists."""
+ outfile = join(self.reportPath, self.statsFile + ".csv")
+ self._printMessage(outfile)
+
+ headers = [ 'trace', 'conciseness', 'cluster_label', 'most_freq_type', 'precision', 'recall', 'cluster_size' ]
+ if not isinstance(runtitle, str):
+ infCols = IndividualClusterReport.inferenceColumns(runtitle)
+ headers = list(infCols.keys()) + headers
+ infParams = list(infCols.values())
+ else:
+ headers = ['run_title'] + headers
+ infParams = [runtitle]
+ headers += list(self._additionalColumns.keys())
+
+        csvWriteHead = not os.path.exists(outfile)
+ with open(outfile, 'a') as csvfile:
+ clStatscsv = csv.writer(csvfile) # type: csv.writer
+ if csvWriteHead:
+ # in "pagetitle": "seg_length", "analysis", "dist_measure", 'min_cluster_size'
+ clStatscsv.writerow(headers)
+ if self.hasNoise:
+ additionalCells = [colData.get("Noise", "") for colData in self._additionalColumns.values()]
+
+ # noinspection PyUnboundLocalVariable
+ clStatscsv.writerow([
+ *infParams,
+ self.pcap.pcapstrippedname if isinstance(self.pcap, StartupFilecheck) else self.pcap,
+ self.conciseness,
+ 'NOISE', str(self.noiseTypes),
+ 'ratio:', self.ratioNoise,
+ self.numNoise] + additionalCells)
+ clStatscsv.writerows([
+ [*infParams,
+ self.pcap.pcapstrippedname if isinstance(self.pcap, StartupFilecheck) else self.pcap,
+ self.conciseness, *pr]
+ + [colData.get(pr[0], "") for colData in self._additionalColumns.values()] # additional columns
+ for pr in self.precisionRecallList if pr is not None
+ ])
+
+class CombinatorialClustersReport(ClusteringReport):
+ """from writeCollectiveClusteringStaticstics"""
+ messagetypeStatsFile = "messagetype-combined-cluster-statistics"
+ segmenttypeStatsFile = "segment-combined-cluster-statistics"
+
+ def __init__(self, groundtruth: Dict[Element, str], pcap: Union[str, StartupFilecheck]):
+ super().__init__(groundtruth, pcap)
+ # set filename for CSV depending on element type (message or segment)
+ ClusteringReport.statsFile = CombinatorialClustersReport.messagetypeStatsFile \
+ if isinstance(next(iter(groundtruth.keys())), AbstractMessage) \
+ else CombinatorialClustersReport.segmenttypeStatsFile
+        self.tp, self.tpfp, self.fn, self.tnfn = [None] * 4
+        self.numNoise, self.numUnknown, self.segTotal, self.segUnique = [None] * 4
+
+ @property
+ def precision(self):
+ # return self.tp / (self.tp + self.fp)
+ if self.tp == 0:
+ return 0
+ return self.tp / self.tpfp
+
+ @property
+ def recall(self):
+ return self.tp / (self.tp + self.fn)
+
+ def write(self, clusters: Dict[Hashable, List[Element]], runtitle: Union[str, Dict], ignoreUnknown=True):
+ """
+ Precision and recall for the whole clustering interpreted as number of draws from pairs of messages.
+
+ For details see: https://nlp.stanford.edu/IR-book/html/htmledition/evaluation-of-clustering-1.html
+        How to calculate the draws is described there for the Rand index.
+
+ Writes a CSV with tp, fp, fn, tn, pr, rc
+ for (a) all clusters and for (b) clusters that have a size of at least 1/40 of the number of samples/messages.
+
+ 'total segs' and 'unique segs' are including 'unknown' and 'noise'
+ """
+ from collections import Counter
+ from itertools import combinations, chain
+ from scipy.special import binom
+
+ self.segTotal = sum(
+ sum(len(el.baseSegments) if isinstance(el, Template) else 1 for el in cl)
+ for cl in clusters.values())
+        self.segUnique = sum(len(cl) for cl in clusters.values())
+
+ if ignoreUnknown:
+ unknownKeys = ["[unknown]", "[mixed]"]
+ self.numUnknown = len([gt for gt in self.groundtruth.values() if gt in unknownKeys])
+ clustersTemp = {lab: [el for el in clu if self.groundtruth[el] not in unknownKeys] for lab, clu in
+ clusters.items()}
+ clusters = {lab: elist for lab, elist in clustersTemp.items() if len(elist) > 0}
+ groundtruth = {sg: gt for sg, gt in self.groundtruth.items() if gt not in unknownKeys}
+ else:
+ groundtruth = self.groundtruth
+ self.numUnknown = "n/a"
+
+ noise = []
+ noisekey = 'Noise' if 'Noise' in clusters else -1 if -1 in clusters else None
+ # print("noisekey", noisekey)
+ if noisekey is not None:
+ noise = clusters[noisekey]
+ clusters = {k: v for k, v in clusters.items() if k != noisekey} # remove the noise
+ self.numNoise = len(noise)
+
+ """
+ # # # # # # # #
+ # test case
+ >>> groundtruth = {
+ >>> "x0": "x", "x1": "x", "x2": "x", "x3": "x", "x4": "x", "x5": "x", "x6": "x", "x7": "x",
+ >>> "o0": "o", "o1": "o", "o2": "o", "o3": "o", "o4": "o",
+ >>> "#0": "#", "#1": "#", "#2": "#", "#3": "#"
+ >>> }
+ >>> clusters = { "A": ["x1", "x2", "x3", "x4", "x5", "o1"],
+ >>> "B": ["x6", "o2", "o3", "o4", "o0", "#1"],
+ >>> "C": ["x7", "x0", "#2", "#3", "#0"],
+ >>> }
+ >>> typeFrequencies = [Counter([groundtruth[element] for element in c])
+ for c in clusters.values()]
+ # # # # # # # #
+ """
+
+ # numTypesOverall = Counter(groundtruth[comparator.messages[element[0].message]]
+ # for c in clusters.values() for element in c)
+ numTypesOverall = Counter(groundtruth.values())
+ # number of types per cluster
+ typeFrequencies = [Counter([groundtruth[element] for element in c])
+ for c in clusters.values()]
+ noiseTypes = Counter([groundtruth[element] for element in noise])
+
+ self.tpfp = sum(binom(len(c), 2) for c in clusters.values())
+ self.tp = sum(binom(t, 2) for c in typeFrequencies for t in c.values())
+ self.tnfn = sum(map(lambda n: n[0] * n[1], combinations(
+ (len(c) for c in chain.from_iterable([clusters.values(), [noise]])), 2))) + \
+ sum(binom(noiseTypes[typeName], 2) for typeName, typeTotal in noiseTypes.items())
+ # import IPython; IPython.embed()
+ # fn = sum(((typeTotal - typeCluster[typeName]) * typeCluster[typeName]
+ # for typeCluster in typeFrequencies + [noiseTypes]
+ # for typeName, typeTotal in numTypesOverall.items() if typeName in typeCluster))//2
+ #
+ # # noise handling: consider all elements in noise as false negatives
+ self.fn = sum(((typeTotal - typeCluster[typeName]) * typeCluster[typeName]
+ for typeCluster in typeFrequencies
+ for typeName, typeTotal in numTypesOverall.items() if typeName in typeCluster)) // 2 + \
+ sum((binom(noiseTypes[typeName], 2) +
+ (
+ (typeTotal - noiseTypes[typeName]) * noiseTypes[typeName]
+ ) // 2
+ for typeName, typeTotal in numTypesOverall.items() if typeName in noiseTypes))
+
+ self._writeCSV(runtitle)
+
+ def _writeCSV(self, runtitle: Union[str, Dict]):
+ """Add the report to the appropriate CSV. Appends rows, if the CSV already exists."""
+ outfile = join(self.reportPath, self.statsFile + ".csv")
+ self._printMessage(outfile)
+
+ headers = [ 'trace', 'true positives', 'false positives', 'false negatives', 'true negatives',
+ 'precision', 'recall', 'noise', 'unknown', 'total segs', 'unique segs' ]
+ if not isinstance(runtitle, str):
+ infCols = IndividualClusterReport.inferenceColumns(runtitle)
+ headers = list(infCols.keys()) + headers
+ infParams = list(infCols.values())
+ else:
+ headers = ['run_title'] + headers
+ infParams = [runtitle]
+
+ row = [*infParams,
+ self.pcap.pcapstrippedname if isinstance(self.pcap, StartupFilecheck) else self.pcap,
+ self.tp,
+ self.tpfp - self.tp,
+ self.fn,
+ self.tnfn - self.fn,
+ self.precision,
+ self.recall,
+ self.numNoise,
+ self.numUnknown,
+ self.segTotal,
+               self.segUnique]
+
+        csvWriteHead = not os.path.exists(outfile)
+ with open(outfile, 'a') as csvfile:
+ clStatscsv = csv.writer(csvfile) # type: csv.writer
+ if csvWriteHead:
+ clStatscsv.writerow(headers)
+ clStatscsv.writerow(row)
+
+
+def plotMultiSegmentLines(segmentGroups: List[Tuple[str, List[Tuple[str, TypedSegment]]]],
+ specimens: SpecimenLoader, pagetitle=None, colorPerLabel=False,
+ typeDict: Dict[str, List[MessageSegment]] = None,
+ isInteractive=False):
+ """
+    A small convenience helper that merely saves writing a few lines of code.
+
+ :param segmentGroups:
+ :param specimens:
+ :param pagetitle:
+ :param colorPerLabel:
+ :param typeDict: dict of types (str-keys: list of segments) present in the segmentGroups
+ :param isInteractive:
+ """
+ from nemere.visualization.multiPlotter import MultiMessagePlotter
+
+ mmp = MultiMessagePlotter(specimens, pagetitle, len(segmentGroups), isInteractive=isInteractive)
+ mmp.plotMultiSegmentLines(segmentGroups, colorPerLabel)
+
+ # TODO Think about replacing this implicit writing of the report CSV to an explicit one by the caller, then, accept
+ # the IndividualClusterReport instance as parameter to retrieve the precision and recall values for the plot.
+ if typeDict:
+ # mapping from each segment in typeDict to the corresponding cluster and true type,
+ # considering representative templates
+ groundtruth = {seg: ft for ft, segs in typeDict.items() for seg in segs}
+ clusters = defaultdict(list)
+ for label, segList in segmentGroups:
+ for tl, seg in segList:
+ if isinstance(seg, Template):
+ clusters[label].extend(seg.baseSegments)
+ else:
+ clusters[label].append(seg)
+
+ # calculate conciseness, correctness = precision, and recall
+ report = IndividualClusterReport(groundtruth, splitext(basename(specimens.pcapFileName))[0])
+ report.write(clusters, pagetitle)
+
+ mmp.textInEachAx(["precision = {:.2f}\n" # correctness
+ "recall = {:.2f}".format(pr[2], pr[3]) if pr else None for pr in report.precisionRecallList])
+
+ mmp.writeOrShowFigure()
+ del mmp
+
+
+class SegmentClusterReport(ClusteringReport):
+ """Clustered elements report for field type clustering with available ground truth to compare to."""
+ statsFile = "segmentclusters"
+
+ def __init__(self, pcap: Union[str, StartupFilecheck], reportPath: str=None):
+ super().__init__(None, pcap, reportPath)
+
+ def write(self, clusters: Dict[str, List[Union[MessageSegment, Template]]], runtitle: Union[str, Dict]=None):
+ self._writeCSV(clusters, runtitle)
+
+ def _writeCSV(self, clusters: Dict[str, List[Union[MessageSegment, Template]]], runtitle: Union[str, Dict]=None):
+ outfile = self._buildOutFilename(runtitle)
+ self._printMessage(outfile)
+
+ with open(outfile, "a") as segfile:
+ segcsv = csv.writer(segfile)
+ segcsv.writerow(["Cluster", "Hex", "Bytes", "occurrence"])
+ for cLabel, segments in clusters.items(): # type: Tuple[str, Union[MessageSegment, Template]]
+ segcsv.writerows({
+ (cLabel, seg.bytes.hex(), seg.bytes, 1 if not isinstance(seg, Template) else len(seg.baseSegments))
+ for seg in segments
+ })
+
+ def _printMessage(self, outfile: str):
+ """Print a user notification about whats happening."""
+ wora = "Append" if os.path.exists(outfile) else "Write"
+ print(f'{wora} field type cluster elements to {outfile}...')
+
+ def _buildOutFilename(self, runtitle: Union[str, Dict]=None):
+ return join(self.reportPath, self.statsFile + (
+ "-" + runtitle if runtitle is not None else ""
+ ) + "-" + (
+ self.pcap.pcapstrippedname if isinstance(self.pcap, StartupFilecheck) else self.pcap
+ ) + ".csv")
+
+class SegmentClusterGroundtruthReport(SegmentClusterReport):
+ """Clustered elements report for field type clustering with available ground truth to compare to."""
+ statsFile = "segmentclusters"
+
+ def __init__(self, comparator: MessageComparator, segments: List[AbstractSegment],
+ pcap: Union[str, StartupFilecheck], reportPath: str=None):
+ """
+
+ :param comparator: The comparator providing the ground truth
+ :param segments: List of segments to write statistics for
+ :param pcap: The filename or StartupFilecheck object pointing to the pcap
+ :param reportPath: If None, automatically determine a path in the report folder using pcap.reportFullPath
+ if available else the globally defined reportFolder
+ """
+ self._comparator = comparator
+ self._segments = segments
+ self._typedMatchSegs, self._typedMatchTemplates = self._matchSegments()
+ reportPath = reportPath if reportPath is not None else pcap.reportFullPath \
+ if isinstance(pcap, StartupFilecheck) else reportFolder
+ super().__init__(pcap, reportPath)
+ self.groundtruth = {rawSeg: typSeg[1].fieldtype if typSeg[0] > 0.5 else "[unknown]"
+ for rawSeg, typSeg in self.typedMatchTemplates.items()}
+
+ def write(self, clusters: Dict[str, Union[MessageSegment, Template]], runtitle: Union[str, Dict]=None):
+ self._writeCSV(clusters, runtitle)
+
+ def _writeCSV(self, clusters: Dict[str, Union[MessageSegment, Template]], runtitle: Union[str, Dict]=None):
+ """Add the report to the appropriate CSV. Appends rows, if the CSV already exists."""
+ outfile = self._buildOutFilename(runtitle)
+ self._printMessage(outfile)
+
+ typedMatchTemplates = self.typedMatchTemplates # type: Dict[Union[Template, MessageSegment], Tuple[float, Union[TypedSegment, TypedTemplate, Template, MessageSegment]]]
+
+ with open(outfile, "a") as segfile:
+ segcsv = csv.writer(segfile)
+ segcsv.writerow(["Cluster", "Hex", "Bytes", "occurrence", "Data Type", "Overlap",
+ "| Field Name (Example)", "rStart", "rEnd"]) # r means "relative to the inferred segment"
+ for cLabel, segments in clusters.items(): # type: Tuple[str, Union[MessageSegment, Template]]
+ # if dc.segments != clusterer.segments:
+ # # Templates resolved to single Segments
+ # segcsv.writerows({(cLabel, seg.bytes.hex(), seg.bytes,
+ # typedMatchSegs[seg][1].fieldtype, typedMatchSegs[seg][0])
+ # for seg in segments})
+ # else:
+ # Templates as is
+ segcsv.writerows({
+ (
+ cLabel, seg.bytes.hex(), seg.bytes,
+ len(seg.baseSegments) if isinstance(seg, Template) else 1,
+ typedMatchTemplates[seg][1].fieldtype if SegmentClusterGroundtruthReport.segIsTyped(
+ typedMatchTemplates[seg][1]) else "[unknown]",
+ typedMatchTemplates[seg][0],
+ self._comparator.lookupField(
+ typedMatchTemplates[seg][1].baseSegments[0] if isinstance(typedMatchTemplates[seg][1],
+ Template)
+ else typedMatchTemplates[seg][1])[1],
+ *self.relativeOffsets(seg)
+ ) for seg in segments
+ })
+
+ @staticmethod
+ def segIsTyped(someSegment):
+ return isinstance(someSegment, (TypedTemplate, TypedSegment))
+
+ def relativeOffsets(self, infSegment):
+ """(Matched templates have offsets and lengths identical to seg (inferred) and not the true one.)"""
+ infSegment = infSegment.baseSegments[0] if isinstance(infSegment, Template) else infSegment
+ overlapRatio, overlapIndex, overlapStart, overlapEnd = self._comparator.fieldOverlap(infSegment)
+ return infSegment.offset - overlapStart, infSegment.nextOffset - overlapEnd
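+        # e.g., an inferred segment spanning bytes 4..8 that overlaps a true field at 2..8
+        # yields (2, 0): the inference starts two bytes late and ends on the true boundary.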
+
+ def _matchSegments(self):
+ # mark segment matches with > 50% overlap with the prevalent true data type for the nearest boundaries.
+ # list of tuples of overlap ratio ("intensity of match") and segment
+ typedMatchSegs = dict() # type: Dict[Union[Template, MessageSegment], Tuple[float, Union[TypedSegment, MessageSegment]]]
+ typedMatchTemplates = dict() # type: Dict[Union[Template, MessageSegment], Tuple[float, Union[TypedSegment, TypedTemplate, Template, MessageSegment]]]
+ for seg in self._segments:
+ # create typed segments/templates per cluster to get the inferred assignment
+ if isinstance(seg, MessageSegment):
+ typedMatchSegs[seg] = self._comparator.segment2typed(seg)
+ typedMatchTemplates[seg] = self._comparator.segment2typed(seg)
+ elif isinstance(seg, Template):
+ typedBaseSegments = [self._comparator.segment2typed(bs) for bs in seg.baseSegments]
+ typedMatchSegs.update({bs: ts for bs, ts in zip(seg.baseSegments, typedBaseSegments)})
+ if any(not isinstance(baseS, TypedSegment) for ratio, baseS in typedBaseSegments):
+ typedMatchTemplates[seg] = (-1.0, seg)
+ # we have no info about this segment's gt
+ continue
+
+ typeRatios = defaultdict(list)
+ for ratio, baseS in typedBaseSegments:
+ typeRatios[baseS.fieldtype].append(ratio)
+ # currently we need this only if there is only one matching type, but "for future use" calc all means.
+ meanRatios = {ft: numpy.mean(ro) for ft, ro in typeRatios.items()}
+ ftkeys = sorted(typeRatios.keys(), key=lambda x: -meanRatios[x])
+                matchingType = ftkeys[0]
+
+                if len(typeRatios) > 1:
+                    # print("Segment's matching field types are not the same in template, e. g., "
+                    #       "{} and {} ({})".format( matchingType, tempTyped.fieldtype, tempTyped.bytes.hex() ))
+                    typedMatchTemplates[seg] = (0.0, seg)
+                else:
+                    typedMatchTemplates[seg] = (float(meanRatios[matchingType]),
+                                                TypedTemplate(seg.values, [ts for _, ts in typedBaseSegments],
+                                                seg._method))
+ return typedMatchSegs, typedMatchTemplates
+
+ @property
+ def typedMatchSegs(self):
+ return self._typedMatchSegs
+
+ @property
+ def typedMatchTemplates(self):
+ return self._typedMatchTemplates
+
diff --git a/src/nemere/validation/dissectorMatcher.py b/src/nemere/validation/dissectorMatcher.py
index 0ec109cd..31dbfead 100644
--- a/src/nemere/validation/dissectorMatcher.py
+++ b/src/nemere/validation/dissectorMatcher.py
@@ -4,11 +4,14 @@
Methods to comparison of a list of messages' inferences and their dissections
and match a message's inference with its dissector in different ways.
"""
-
-from typing import List, Tuple, Dict, Iterable, Generator, Union
+from abc import ABC, abstractmethod
+from itertools import chain
+from typing import List, Tuple, Dict, Iterable, Generator, Union, Sequence
from collections import OrderedDict
import copy
+import math, numpy
+from netzob.Model.Vocabulary.Messages.L2NetworkMessage import L2NetworkMessage
from numpy import argmin
from netzob import all as netzob
@@ -19,29 +22,54 @@
from nemere.inference.segments import MessageSegment, TypedSegment
+# TODO find a suitable value
+# messageparsetimeout = 600
+messageparsetimeout = 60*120
+
+def stop_process_pool(executor):
+ for pid, process in executor._processes.items():
+ process.terminate()
+ executor.shutdown()
+
+class WatchdogTimeout(Exception):
+ pass
+
class FormatMatchScore(object):
"""
Object to hold all relevant data of an FMS.
"""
- message = None
- symbol = None
- trueFormat = None
- score = None
- specificyPenalty = None
- matchGain = None
- specificy = None
- nearWeights = None
- meanDistance = None # mean of "near" distances
- trueCount = None
- inferredCount = None
- exactCount = None
- nearCount = None
- exactMatches = None
- nearMatches = None
-
-
-
-class MessageComparator(object):
+ def __init__(self, message = None, symbol = None):
+ self.message = message
+ self.symbol = symbol
+ self.trueFormat = None
+ self.score = None
+ self.specificyPenalty = None
+ self.matchGain = None
+ self.specificy = None
+ self.nearWeights = None
+ self.meanDistance = None # mean of "near" distances
+ self.trueCount = None
+ self.inferredCount = None
+ self.exactCount = None
+ self.nearCount = None
+ self.exactMatches = None
+ self.nearMatches = None
+
+
+class BaseComparator(object):
+ """Dummy for using nemere.utils.evaluationHelpers.CachedDistances with a unknown protocol."""
+ import nemere.utils.loader as sl
+
+ def __init__(self, specimens: sl.SpecimenLoader, layer: int = -1, relativeToIP: bool = False, debug = False):
+ self.specimens = specimens
+ self.messages = specimens.messagePool # type: OrderedDict[AbstractMessage, netzob.RawMessage]
+ """:type messages: OrderedDict[AbstractMessage, RawMessage]"""
+ self.baselayer = specimens.getBaseLayerOfPCAP()
+ self.debug = debug
+ self._targetlayer = layer
+ self._relativeToIP = relativeToIP
+
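+# Minimal usage sketch for an unknown protocol without dissector ground truth
+# (the file name is illustrative):
+#   specimens = sl.SpecimenLoader("unknown-protocol.pcap", layer=2, relativeToIP=True)
+#   comparator = BaseComparator(specimens, layer=2, relativeToIP=True)
+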
+class MessageComparator(BaseComparator):
"""
Formal and visual comparison of a list of messages' inferences and their dissections.
@@ -53,21 +81,14 @@ class MessageComparator(object):
__messageCellCache = dict() # type: Dict[(netzob.Symbol, AbstractMessage), List]
- def __init__(self, specimens: sl.SpecimenLoader,
- layer: int = -1, relativeToIP: bool = False, failOnUndissectable=True,
- debug = False):
- self.specimens = specimens
- self.messages = specimens.messagePool # type: OrderedDict[AbstractMessage, netzob.RawMessage]
- """:type messages: OrderedDict[AbstractMessage, RawMessage]"""
- self.baselayer = specimens.getBaseLayerOfPCAP()
- self.debug = debug
+ def __init__(self, specimens: sl.SpecimenLoader, layer: int = -1, relativeToIP: bool = False,
+ failOnUndissectable=True, debug = False):
+ super().__init__(specimens, layer, relativeToIP, debug)
- # Cache messages that already have been parsed and labeled
- self._messageCache = dict() # type: Dict[netzob.RawMessage, ]
- self._targetlayer = layer
- self._relativeToIP = relativeToIP
self._failOnUndissectable = failOnUndissectable
+ # Cache messages that already have been parsed and labeled
+ self._messageCache = dict() # type: Dict[netzob.RawMessage, ]
self._dissections = self._dissectAndLabel(self.messages.values())
@@ -111,6 +132,8 @@ def _dissectAndLabel(self, messages: Iterable[netzob.RawMessage]) \
self._messageCache[m] = reparsed
labeledMessages[m] = self._messageCache[m]
+ ParsedMessage.closetshark()
+
return labeledMessages
@@ -122,7 +145,6 @@ def dissections(self) -> Dict[netzob.RawMessage, List[Tuple[str, int]]]:
def parsedMessages(self) -> Dict[netzob.RawMessage, ParsedMessage]:
return self._dissections
-
@staticmethod
def fieldEndsPerSymbol(nsymbol: netzob.Symbol, message: AbstractMessage):
"""
@@ -136,15 +158,37 @@ def fieldEndsPerSymbol(nsymbol: netzob.Symbol, message: AbstractMessage):
if not message in nsymbol.messages:
raise ValueError('Message in input symbol unknown by this comparator.')
+ from concurrent.futures.process import ProcessPoolExecutor
+ from concurrent.futures import TimeoutError as FutureTOError
+
# since netzob.Symbol.getMessageCells is EXTREMELY inefficient,
# we try to have it run as seldom as possible by caching its output
if (nsymbol, message) not in MessageComparator.__messageCellCache:
- # TODO wrap Symbols.getMessageCell in watchdog: run it in process and abort it after a timeout.
- # Look for my previous solution in siemens repos
# DHCP fails due to a very deep recursion here:
- mcells = nsymbol.getMessageCells(encoded=False) # dict of cells keyed by message
+ # Wrap Symbols.getMessageCell in watchdog: run it in process and abort it after a timeout.
+ # TODO Look for my previous solution in siemens repos
+ # Wait only messageparsetimeout seconds for Netzob's MessageParser to return the result
+ with ProcessPoolExecutor(max_workers=1) as executor:
+ try:
+ future = executor.submit(nsymbol.getMessageCells, encoded=False)
+ mcells = future.result(messageparsetimeout) # dict of cells keyed by message
+ msgIdMap = {msg.id: msg for msg in nsymbol.messages}
+ except FutureTOError as e:
+ stop_process_pool(executor)
+ raise WatchdogTimeout(f"Parsing of Netzob symbol {nsymbol.name} timed out after "
+ f"{messageparsetimeout} seconds.")
+ # Non-process call:
+ # mcells = nsymbol.getMessageCells(encoded=False) # dict of cells keyed by message
+ # for msg, fields in mcells.items():
+ # MessageComparator.__messageCellCache[(nsymbol, msg)] = fields
+ #
+ # IPC breaks the identity check of messages, since the object instance needs to be copied.
+ # Look up and use the correct message instances.
+ # TODO Keep in mind that this might break something since the fields
+ # still do contain references to the copied messages in:
+ # .messages and .parent.messages
for msg, fields in mcells.items():
- MessageComparator.__messageCellCache[(nsymbol, msg)] = fields
+ MessageComparator.__messageCellCache[(nsymbol, msgIdMap[msg.id])] = fields
mcontent = MessageComparator.__messageCellCache[(nsymbol, message)]
nfieldlengths = [len(field) for field in mcontent]
@@ -197,81 +241,27 @@ def uniqueFormats(onlyformats: Iterable) -> List[List[Tuple[str, int]]]:
return distinctFormats
- def pprint2Interleaved(self, message: AbstractMessage, inferredFieldEnds: List[int]=None,
+ def pprint2Interleaved(self, message: AbstractMessage, segmentsPerMsg: Sequence[Sequence[MessageSegment]]=tuple(),
mark: Union[Tuple[int,int], MessageSegment]=None,
messageSlice: Tuple[Union[int,None],Union[int,None]]=None):
"""
-
:param message: The message from which to print the byte hex values. Also used to look up the
true field boundaries to mark by spaces between in the printed byte hex values.
- :param inferredFieldEnds: The field ends that should be visualized by color changes.
+ :param segmentsPerMsg: The segments that should be visualized by color changes.
:param mark: Start and end indices of a range to mark by underlining.
:param messageSlice: Tuple used as parameters of the slice builtin to select a subset of all messages to print.
Use None to create an open slice (up to the beginning or end of the message).
- :return:
"""
- import nemere.visualization.bcolors as bc
-
- l2msg = self.messages[message]
- tformat = self.dissections[l2msg]
- tfe = MessageComparator.fieldEndsFromLength([l for t, l in tformat])
- msglen = len(message.data)
- absSlice = (
- messageSlice[0] if messageSlice is not None and messageSlice[0] is not None else 0,
- messageSlice[1] if messageSlice is not None and messageSlice[1] is not None else msglen
- )
- dataSnip = message.data if messageSlice is None else message.data[slice(*messageSlice)]
-
-
- ife = [0] + sorted(inferredFieldEnds if inferredFieldEnds is not None else self.fieldEndsPerMessage(message))
- ife += [msglen] if ife[-1] < msglen else []
-
- if mark is not None:
- if isinstance(mark, MessageSegment):
- mark = mark.offset, mark.nextOffset
- assert mark[0] >= absSlice[0], repr(mark) + repr(messageSlice)
- assert mark[1] <= absSlice[1], repr(mark) + repr(messageSlice)
-
- hexdata = list() # type: List[str]
- lastcolor = None
- for po, by in enumerate(dataSnip, absSlice[0]):
- # end mark
- if mark is not None and po == mark[1]:
- hexdata.append(bc.ENDC)
- # restart color after mark end
- if lastcolor is not None and lastcolor < po and po not in ife:
- hexdata.append(bc.eightBitColor(lastcolor % 231 + 1))
-
- # have a space in place of each true field end in the hex data.
- if po in tfe:
- hexdata.append(' ')
-
- # have a different color per each inferred field
- if po in ife:
- if po > 0:
- lastcolor = None
- hexdata.append(bc.ENDC)
- # restart mark after color change
- if mark is not None and mark[0] < po < mark[1]:
- hexdata.append(bc.UNDERLINE)
- if po < absSlice[1]:
- lastcolor = po
- hexdata.append(bc.eightBitColor(po % 231 + 1))
-
- # start mark
- if mark is not None and po == mark[0]:
- hexdata.append(bc.UNDERLINE)
-
- # add the actual value
- hexdata.append('{:02x}'.format(by))
- hexdata.append(bc.ENDC)
-
- print(''.join(hexdata))
+ from ..visualization.simplePrint import ComparingPrinter
+ rawmsg = self.messages[message] if isinstance(message, L2NetworkMessage) else message
+ cprinter = ComparingPrinter(self, segmentsPerMsg)
+ cprinter.toConsole([rawmsg], mark, messageSlice)
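pprint2Interleaved is now a thin wrapper around ComparingPrinter; a hypothetical call with segments from an inference run (comparator, l4msg, and segmentsPerMsg are placeholders) would be:

# color by the inferred segmentation, underline bytes 4-8, print only the first 32 bytes
comparator.pprint2Interleaved(l4msg, segmentsPerMsg, mark=(4, 8), messageSlice=(0, 32))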
def __prepareMessagesForPPrint(self, symbols: Iterable[netzob.Symbol]) \
- -> List[List[Tuple[AbstractMessage, List[int], List[int]]]]:
+ -> List[List[Tuple[AbstractMessage, List[int], Union[List[int], WatchdogTimeout]]]]:
"""
+ Iterate symbols and their messages, determine boundary lists of true and inferred formats.
    :param symbols: inferred Netzob symbols
:return: list (symbols) of list (messages) of tuple with message, true, and inferred field ends
@@ -286,8 +276,12 @@ def __prepareMessagesForPPrint(self, symbols: Iterable[netzob.Symbol]) \
tformat = self.dissections[l2msg]
msglen = len(msg.data)
tfe = MessageComparator.fieldEndsFromLength([l for t, l in tformat])
- ife = [0] + MessageComparator.fieldEndsPerSymbol(sym, msg)
- ife += [msglen] if ife[-1] < msglen else []
+ try:
+ # catch WatchdogTimeout in callers of fieldEndsPerSymbol
+ ife = [0] + MessageComparator.fieldEndsPerSymbol(sym, msg)
+ ife += [msglen] if ife[-1] < msglen else []
+ except WatchdogTimeout as e:
+ ife = e
msgfes.append((msg, tfe, ife))
symfes.append(msgfes)
return symfes
@@ -321,13 +315,16 @@ def pprintInterleaved(self, symbols: List[netzob.Symbol]):
# add the actual value
hexdata.append('{:02x}'.format(by))
- hexdata.append(bc.ENDC)
+ if isinstance(ife, WatchdogTimeout):
+ # handle Netzob WatchdogTimeout in fieldEndsPerSymbol
+ print(str(ife), end=" ")
+ else:
+ hexdata.append(bc.ENDC)
print(''.join(hexdata))
print('true fields: SPACE | inferred fields: color change')
-
def lprintInterleaved(self, symbols: List[netzob.Symbol]):
"""
Generate LaTeX source code visualizing the interleaved true and inferred format upon the byte values
@@ -352,7 +349,7 @@ def lprintInterleaved(self, symbols: List[netzob.Symbol]):
hexdata.append(tfemarker)
# have a different color per each inferred field
- if po in ife:
+ if not isinstance(ife, WatchdogTimeout) and po in ife:
if po > 0:
hexdata.append(ifeendmarker)
if po < len(msg.data):
@@ -360,13 +357,18 @@ def lprintInterleaved(self, symbols: List[netzob.Symbol]):
# add the actual value
hexdata.append('{:02x}'.format(by))
- hexdata.append(ifeendmarker)
- texcode += '\\noindent\n\\texttt{' + ''.join(hexdata) + '}\n\n'
+ if isinstance(ife, WatchdogTimeout):
+ # handle Netzob WatchdogTimeout in fieldEndsPerSymbol
+ note = str(ife) + "\\hspace{3em}"
+ else:
+ hexdata.append(ifeendmarker)
+ note = ""
+ texcode += '\\noindent\n' + note + '\\texttt{' + ''.join(hexdata) + '}\n\n'
texcode += '\\bigskip\ntrue fields: SPACE | inferred fields: framed box'
return texcode
- def tprintInterleaved(self, symbols: Iterable[netzob.Symbol]):
+ def tprintInterleaved(self, symbols: Sequence[netzob.Symbol]):
"""
Generate tikz source code visualizing the interleaved true and inferred format upon the byte values
of each message in the symbols.
@@ -375,44 +377,101 @@ def tprintInterleaved(self, symbols: Iterable[netzob.Symbol]):
Also see self.pprintInterleaved doing the same for terminal output.
+ TODO cleanup: adapt and use nemere.visualization.simplePrint.ComparingPrinter (which is functionally equivalent)
+
:param symbols: Inferred symbols
- :return LaTeX code
+        :return: LaTeX/tikz code
"""
tfemarker = '1ex '
- texcode = """
-\\begin{tikzpicture}[node distance=0pt, yscale=.5,
- every node/.style={font=\\ttfamily, text height=.7em, outer sep=0, inner sep=0},
- tfe/.style={draw, minimum height=1.2em, thick}]
-"""
+ texcode = ""
+
+ ftlabels = set()
+ for sym in symbols:
+ for msg in sym.messages:
+ pm = self.parsedMessages[self.messages[msg]]
+ ftlabels.update(t[0] for t in pm.getTypeSequence())
+ ftstyles = {lab: "fts" + lab.replace("_", "").replace(" ", "") for lab in ftlabels} # field-type label to style name
+        ftcolornames = {tag: "col" + tag[3:] for tag in ftstyles.values()}  # style name to color name
+        ftcolors = list()  # color definitions
+ for tag in ftcolornames.values():
+ lightness = 0 # choose only light colors
+ while lightness < .5:
+                rgb = numpy.random.rand(3)
+                lightness = 0.5 * min(rgb) + 0.5 * max(rgb)
+            ftcolors.append(f"\\definecolor{{{tag}}}{{rgb}}{{{rgb[0]},{rgb[1]},{rgb[2]}}}")
+ texcode += "\n ".join(ftcolors) + "\n"
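The loop above rejection-samples colors until they are light enough to read text on; as a standalone function using the same HSL-style lightness estimate (max+min)/2:

import numpy

def random_light_color(minimum_lightness: float = 0.5) -> numpy.ndarray:
    # rejection-sample RGB triples until the HSL lightness reaches the minimum
    lightness, rgb = 0.0, None
    while lightness < minimum_lightness:
        rgb = numpy.random.rand(3)
        lightness = 0.5 * (rgb.min() + rgb.max())
    return rgb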
+
+ styles = ["every node/.style={font=\\ttfamily, text height=.7em, outer sep=0, inner sep=0}",
+ "tfe/.style={draw, minimum height=1.2em, thick}", "tfelabel/.style={rotate=-20, anchor=north west}"]
+ styles += [f"{sty}/.style={{fill={ftcolornames[sty]}}}" for sty in ftstyles.values() ]
+
+ texcode += "\n\\begin{tikzpicture}[node distance=0pt, yscale=2,\n"
+ texcode += ",\n".join(styles) + "]"
for symid, symfes in enumerate(self.__prepareMessagesForPPrint(symbols)):
for msgid, (msg, tfe, ife) in enumerate(symfes):
+ pm = self.parsedMessages[self.messages[msg]]
+ offset2type = list(chain.from_iterable( [lab]*lgt for lab, lgt in pm.getTypeSequence() ))
+ offset2name = dict()
+ offset = 0
+ for name, lgt in pm.getFieldSequence():
+ offset2name[offset] = name.replace("_", "\\_")
+ offset += lgt
+
smid = symid + msgid
hexdata = list() # type: List[str]
hexdata.append('\n\n\\coordinate(m{}f0) at (0,{});'.format(smid, -smid))
for po, by in enumerate(msg.data, start=1):
# add the actual value
- hexdata.append('\\node[right={}of m{}f{}] (m{}f{}) {{{:02x}}};'.format(
+ hexdata.append('\\node[right={}of m{}f{}, {}{}] (m{}f{}) {{{:02x}}};'.format(
# have a 1ex space in place of each true field end in the hex data.
- tfemarker if po-1 in tfe else '', smid, po-1, smid, po, by))
+ tfemarker if po-1 in tfe else '', smid, po-1,
+ # style for the field type
+ ftstyles[offset2type[po-1]],
+ f", label={{[tfelabel]below:\\sffamily\\tiny {offset2name[po-1]}}}" if po-1 in offset2name else "",
+ smid, po, by)
+ )
texcode += '\n'.join(hexdata)
# have a frame per each inferred field
fitnodes = list()
- for pol, por in zip(ife[:-1], ife[1:]):
+ if isinstance(ife, WatchdogTimeout):
+ # handle Netzob WatchdogTimeout in fieldEndsPerSymbol
fitnodes.append(
- '\\node[fit=(m{}f{})(m{}f{}), tfe] {{}};'.format(smid, pol+1, smid, por)
+ '\\node[] at (m{}f0) {{{}}};'.format(smid, str(ife))
)
+ else:
+ for pol, por in zip(ife[:-1], ife[1:]):
+ fitnodes.append(
+ '\\node[fit=(m{}f{})(m{}f{}), tfe] {{}};'.format(smid, pol+1, smid, por)
+ )
+ # TODO add the inferred field's "truest" type as label
texcode += '\n' + '\n'.join(fitnodes)
texcode += """
\end{tikzpicture}
+\\centering
\\bigskip\ntrue fields: SPACE | inferred fields: framed box
+
+True field type colors:\\\\
"""
- return texcode
+ for lab, tag in ftstyles.items():
+ texlab = lab.replace("_", "\\_")
+ texcode += f"\\colorbox{{{ftcolornames[tag]}}}{{{texlab}}}\\\\\n"
+
+ return texcode + "\n"
def segment2typed(self, segment: MessageSegment) -> Tuple[float, Union[TypedSegment, MessageSegment]]:
+ overlapRatio, overlapIndex, overlapStart, overlapEnd = self.fieldOverlap(segment)
+ messagetype, fieldname, fieldtype = self.lookupField(segment)
+
+ # return a typed version of the segment and the ratio of overlapping bytes to the segment length
+ return overlapRatio, TypedSegment(segment.analyzer, segment.offset, segment.length, fieldtype)
+
+
+ def fieldOverlap(self, segment: MessageSegment):
+        """
+        Determine the overlap between the given segment and its closest true field.
+
+        :return: tuple of the overlap ratio (overlapping bytes relative to the segment length),
+            the index of the overlapped true field in the message's field sequence,
+            and the start and end offsets of the overlap.
+        """
parsedMessage = self.parsedMessages[self.messages[segment.message]]
fieldSequence = list()
off = 0
@@ -453,10 +512,7 @@ def segment2typed(self, segment: MessageSegment) -> Tuple[float, Union[TypedSegm
assert trueEnd in fieldSequence, "Field sequence is not matching any possible overlap. Investigate!"
overlapIndex = fieldSequence.index(trueEnd)
- fieldtype = parsedMessage.getTypeSequence()[overlapIndex][0]
-
- # return a typed version of the segment and the ratio of overlapping bytes to the segment length
- return overlapRatio, TypedSegment(segment.analyzer, segment.offset, segment.length, fieldtype)
+ return overlapRatio, overlapIndex, overlapStart, overlapEnd
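For intuition about the return values (hypothetical numbers, assuming end-exclusive offsets): a four-byte segment covering bytes [2, 6) that overlaps a true field at bytes [0, 4) shares the range [2, 4) with it, so:

overlapRatio = 2 / 4             # two of the segment's four bytes lie in the true field
overlapStart, overlapEnd = 2, 4  # the shared byte range [2, 4)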
def lookupField(self, segment: MessageSegment):
@@ -472,13 +528,11 @@ def lookupField(self, segment: MessageSegment):
field name (from tshark nomenclature),
field type (from ParsingConstants in messageParser.py)
"""
- pm = self.parsedMessages[self.messages[segment.message]]
- fs = pm.getFieldSequence()
- fsnum, offset = 0, 0
- while offset < segment.offset:
- offset += fs[fsnum][1]
- fsnum += 1
- return pm.messagetype, fs[fsnum][0], pm.getTypeSequence()[fsnum][0]
+ parsedMessage = self.parsedMessages[self.messages[segment.message]]
+ overlapRatio, overlapIndex, overlapStart, overlapEnd = self.fieldOverlap(segment)
+
+ return parsedMessage.messagetype, \
+ parsedMessage.getFieldSequence()[overlapIndex][0], parsedMessage.getTypeSequence()[overlapIndex][0]
def segmentInfo(self, segment: MessageSegment):
@@ -498,6 +552,7 @@ def segmentInfo(self, segment: MessageSegment):
def lookupValues4FieldName(self, fieldName: str):
"""
Lookup the values for a given field name in all messages.
+ # TODO comparator.lookupValues4FieldName with list of messages (i.e., cluster elements)
:param fieldName: name of field (according to tshark nomenclature)
:return: List of values of all fields carrying the given field name
@@ -508,64 +563,51 @@ def lookupValues4FieldName(self, fieldName: str):
return values
-class DissectorMatcher(object):
+class AbstractDissectorMatcher(ABC):
"""
Incorporates methods to match a message's inference with its dissector in different ways.
    Dissections are done by MessageComparator, so this class does not need direct interaction
with tshark nor any knowledge of layer, relativeToIP, and failOnUndissectable.
"""
-
- def __init__(self, mc: MessageComparator, inferredSymbol: netzob.Symbol, message:AbstractMessage=None):
+ @abstractmethod
+ def __init__(self, mc: MessageComparator, message: AbstractMessage=None):
"""
Prepares matching of one message's (or message type's) inference with its dissector.
:param mc: the object holding the low level message and dissection information
- :param inferredSymbol: Symbol from inference to match, The corresponding dissection is implicit by the message.
- :param message: If not given, uses the first message in the inferredSymbol, otherwise this message is used
- to determine a dissection and a instantiation of the inference for comparison.
+ :param message: Message in segments to match.
"""
self.debug = False
-
- self.__message = None
- """L4 Message"""
- if message:
- assert message in inferredSymbol.messages
- self.__message = message
- else:
- assert len(inferredSymbol.messages) > 0
- self.__message = inferredSymbol.messages[0]
-
- self.__comparator = mc
+ self._message = message
+ self._comparator = mc
"""set of specimens message is contained in"""
- self.__inferredSymbol = inferredSymbol
- """Symbol from inference"""
- self.__inferredFields, self.__dissectionFields = self._inferredandtrueFieldEnds(inferredSymbol)
- """Lists of field ends including message end"""
+ tformat = self._comparator.parsedMessages[self._comparator.messages[self._message]].getTypeSequence()
+ tfieldlengths = [fieldlength for sfieldtype, fieldlength in tformat]
+ self._dissectionFields = MessageComparator.fieldEndsFromLength(tfieldlengths)
+ """Lists of field ends including message end"""
+ self._inferredFields = None # must be filled by subclass!
@property
def inferredFields(self):
"""
:return: List of inferred field ends.
"""
- return self.__inferredFields
-
+ return self._inferredFields
@property
def dissectionFields(self):
"""
:return: List of true field ends according to the dissection.
"""
- return self.__dissectionFields
-
+ return self._dissectionFields
def exactMatches(self) -> List[int]:
"""
:return: exact matches of field ends in dissection and inference (excluding beginning and end of message)
"""
- return [dife for dife in self.__dissectionFields[:-1] if dife in self.__inferredFields[:-1]]
-
+ return [dife for dife in self._dissectionFields[:-1] if dife in self._inferredFields[:-1]]
def nearMatches(self) -> Dict[int, int]:
"""
@@ -575,41 +617,14 @@ def nearMatches(self) -> Dict[int, int]:
difescopes = self.dissectorFieldEndScopes()
nearmatches = dict() # dife : nearest infe if in scope
for dife, piv in difescopes.items():
- ininscope = [infe for infe in self.__inferredFields if piv[0] <= infe <= piv[1]]
+ ininscope = [infe for infe in self._inferredFields if piv[0] <= infe <= piv[1]]
if len(ininscope) == 0:
continue
closest = argmin([abs(dife - infe) for infe in ininscope]).astype(int)
+ # noinspection PyTypeChecker
nearmatches[dife] = ininscope[closest]
return nearmatches
-
- def inferredInDissectorScopes(self) -> Dict[int, List[int]]:
- """
- :return: any matches of field ends in dissection and inference (excluding beginning and end of message).
- Depends on the scopes returned by self.allDissectorFieldEndScopes()
- """
- difescopes = self.allDissectorFieldEndScopes()
- nearmatches = dict() # dife : nearest infe if in scope
- for dife, piv in difescopes.items():
- ininscope = [infe for infe in self.__inferredFields if piv[0] <= infe <= piv[1]]
- if len(ininscope) == 0:
- continue
- nearmatches[dife] = ininscope
- return nearmatches
-
-
- def distancesFromDissectorFieldEnds(self) -> Dict[int, List[int]]:
- """
- get distances for all inferred fields per true field.
-
- :return: dict(true field ends: List[signed inferred distances])
- negative distances are inferred fields ends left to the true field end
- """
- inferredForTrue = self.inferredInDissectorScopes()
- return {tfe: [ife - tfe for ife in ifes] for tfe, ifes in inferredForTrue.items()}
-
-
-
def dissectorFieldEndScopes(self) -> Dict[int, Tuple[int, int]]:
"""
:return: Byte position ranges (scopes) of field ends that are no exact matches and are longer than zero.
@@ -618,14 +633,14 @@ def dissectorFieldEndScopes(self) -> Dict[int, Tuple[int, int]]:
exactMatches = self.exactMatches()
difescopes = dict()
- for idxl in range(len(self.__dissectionFields)-1):
- center = self.__dissectionFields[idxl]
+ for idxl in range(len(self._dissectionFields) - 1):
+ center = self._dissectionFields[idxl]
if center in exactMatches:
continue # if there is an exact match on this field,
# do not consider any other inferred field ends in its scope.
- left = self.__dissectionFields[idxl-1]
- right = self.__dissectionFields[idxl+1]
+ left = self._dissectionFields[idxl - 1]
+ right = self._dissectionFields[idxl + 1]
            # single-byte fields can never have near matches, hence the if ... else
pivl = left + (center - left) // 2 if center - left > 1 else center
@@ -638,16 +653,15 @@ def dissectorFieldEndScopes(self) -> Dict[int, Tuple[int, int]]:
difescopes[center] = (pivl, pivr)
return difescopes
-
def allDissectorFieldEndScopes(self) -> Dict[int, Tuple[int, int]]:
"""
:return: All byte position ranges (scopes) of field ends regardless whether they are exact matches.
"""
difescopes = dict()
- for idxl in range(len(self.__dissectionFields)-1):
- center = self.__dissectionFields[idxl]
- left = self.__dissectionFields[idxl-1]
- right = self.__dissectionFields[idxl+1]
+ for idxl in range(len(self._dissectionFields) - 1):
+ center = self._dissectionFields[idxl]
+ left = self._dissectionFields[idxl - 1]
+ right = self._dissectionFields[idxl + 1]
            # single-byte fields can never have near matches, hence the if ... else
pivl = left + (center - left) // 2 if center - left > 1 else center
@@ -658,8 +672,141 @@ def allDissectorFieldEndScopes(self) -> Dict[int, Tuple[int, int]]:
return difescopes
+ def inferredInDissectorScopes(self) -> Dict[int, List[int]]:
+ """
+ :return: any matches of field ends in dissection and inference (excluding beginning and end of message).
+ Depends on the scopes returned by self.allDissectorFieldEndScopes()
+ """
+ difescopes = self.allDissectorFieldEndScopes()
+ nearmatches = dict() # dife : nearest infe if in scope
+ for dife, piv in difescopes.items():
+ ininscope = [infe for infe in self._inferredFields if piv[0] <= infe <= piv[1]]
+ if len(ininscope) == 0:
+ continue
+ nearmatches[dife] = ininscope
+ return nearmatches
+
+ def distancesFromDissectorFieldEnds(self) -> Dict[int, List[int]]:
+ """
+ get distances for all inferred fields per true field.
+
+ :return: dict(true field ends: List[signed inferred distances])
+ negative distances are inferred fields ends left to the true field end
+ """
+ inferredForTrue = self.inferredInDissectorScopes()
+ return {tfe: [ife - tfe for ife in ifes] for tfe, ifes in inferredForTrue.items()}
- def _inferredandtrueFieldEnds(self, nsymbol: netzob.Symbol, tformat: List[Tuple]=None)\
+ def calcFMS(self):
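+        """
+        Calculate the Format Match Score (FMS) from the exact and near matches between the
+        inferred field ends and the true (dissected) field ends of this matcher's message.
+
+        :return: FormatMatchScore for this matcher's message
+        """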
+ exactmatches = self.exactMatches()
+ nearmatches = self.nearMatches() # TODO check the associated inferred field index (sometimes wrong?)
+ nearestdistances = {tfe: min(dists) for tfe, dists
+ in self.distancesFromDissectorFieldEnds().items()
+ if tfe in nearmatches}
+ # fieldendscopes = dm.dissectorFieldEndScopes()
+
+ exactcount = len(exactmatches)
+ nearcount = len(nearmatches)
+ fieldcount = len(self.dissectionFields) - 1
+ inferredcount = len(self.inferredFields) - 1
+
+ # nearmatches weighted by distance, /2 to increase spread -> less intense penalty for deviation from 0
+ nearweights = {tfe: math.exp(- ((dist / 2) ** 2))
+ for tfe, dist in nearestdistances.items()}
+
+ # penalty for over-/under-specificity (normalized to true field count)
+ try:
+ specificyPenalty = math.exp(- ((fieldcount - inferredcount) / fieldcount) ** 2)
+ except ZeroDivisionError:
+ raise ZeroDivisionError("Offending message:\n{}".format(self._message.data.hex()))
+ matchGain = (exactcount + sum([nearweights[nm] for nm in nearmatches.keys()])) / fieldcount
+ score = specificyPenalty * (
+ # exact matches + weighted near matches
+ matchGain)
+
+ fms = FormatMatchScore(self._message)
+ fms.trueFormat = self._comparator.parsedMessages[self._comparator.messages[self._message]].getTypeSequence()
+ fms.score = score
+ fms.specificyPenalty = specificyPenalty
+ fms.matchGain = matchGain
+ fms.nearWeights = nearweights
+ fms.meanDistance = numpy.mean(list(nearestdistances.values())) if len(nearestdistances) > 0 else numpy.nan
+ fms.trueCount = fieldcount
+ fms.inferredCount = inferredcount
+ fms.exactCount = exactcount
+ fms.nearCount = nearcount
+ fms.specificy = fieldcount - inferredcount
+ fms.exactMatches = exactmatches
+ fms.nearMatches = nearmatches
+
+ return fms
+
+
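To make the scoring above concrete, a worked example with made-up counts: three exact matches, two near matches at signed distances 1 and -2, five true and six inferred fields.

import math

fieldcount, inferredcount, exactcount = 5, 6, 3
nearestdistances = {12: 1, 20: -2}   # true field end -> signed distance of nearest inferred end
nearweights = {tfe: math.exp(-((dist / 2) ** 2)) for tfe, dist in nearestdistances.items()}
specificyPenalty = math.exp(-((fieldcount - inferredcount) / fieldcount) ** 2)  # ~0.961
matchGain = (exactcount + sum(nearweights.values())) / fieldcount               # ~0.829
score = specificyPenalty * matchGain                                            # ~0.797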
+class BaseDissectorMatcher(AbstractDissectorMatcher):
+ """
+ Incorporates methods to match a message's inference with its dissector in different ways.
+
+    Dissections are done by MessageComparator, so this class does not need direct interaction
+ with tshark nor any knowledge of layer, relativeToIP, and failOnUndissectable.
+ """
+ def __init__(self, mc: MessageComparator, messageSegments: List[MessageSegment]):
+ """
+ Prepares matching of one message's (or message type's) inference with its dissector.
+
+ :param mc: the object holding the low level message and dissection information
+        :param messageSegments: Message in segments from inference to match, in offset order.
+ The corresponding dissection is implicit by the message.
+ """
+        # check that messageSegments is not empty and that all segments belong to the same message
+        assert len(messageSegments) > 0
+        assert all(messageSegments[0].message == seg.message for seg in messageSegments)
+ super().__init__(mc, messageSegments[0].message)
+
+ self.__messageSegments = messageSegments
+ """Message in segments from inference"""
+ self._inferredFields = [0] + [seg.nextOffset for seg in messageSegments]
+ if self._inferredFields[-1] < len(self._message.data):
+ self._inferredFields += [len(self._message.data)]
+
+
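A hypothetical use of the new matcher, given a MessageComparator and the segment list of one inferred message:

matcher = BaseDissectorMatcher(comparator, messageSegments)
fms = matcher.calcFMS()
print(fms.score, matcher.exactMatches(), matcher.nearMatches())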
+class DissectorMatcher(AbstractDissectorMatcher):
+ """
+ Incorporates methods to match a message's inference with its dissector in different ways.
+
+    TODO: check where this can be replaced by BaseDissectorMatcher, which performs better
+        since it omits the parsing of Netzob Symbols.
+
+    Dissections are done by MessageComparator, so this class does not need direct interaction
+ with tshark nor any knowledge of layer, relativeToIP, and failOnUndissectable.
+ """
+
+ def __init__(self, mc: MessageComparator, inferredSymbol: netzob.Symbol, message: AbstractMessage=None):
+ """
+ Prepares matching of one message's (or message type's) inference with its dissector.
+
+ :param mc: the object holding the low level message and dissection information
+        :param inferredSymbol: Symbol from inference to match. The corresponding dissection is implicit by the message.
+        :param message: If not given, uses the first message in the inferredSymbol; otherwise this message is used
+            to determine a dissection and an instantiation of the inference for comparison.
+
+ :raises WatchdogTimeout: If Netzob symbol parsing times out
+ """
+        # L4 message
+ if message:
+ assert message in inferredSymbol.messages
+ else:
+ assert len(inferredSymbol.messages) > 0
+ message = inferredSymbol.messages[0]
+ super().__init__(mc, message)
+
+ self._inferredSymbol = inferredSymbol
+ """Symbol from inference"""
+ try:
+ self._inferredFields, self._dissectionFields = self._inferredandtrueFieldEnds(inferredSymbol)
+ """Lists of field ends including message end"""
+ except RuntimeError as e:
+            print("Runtime error, most probably due to a Netzob message parsing error. Re-raising.")
+ raise e
+
+ def _inferredandtrueFieldEnds(self, nsymbol: netzob.Symbol, tformat: List[Tuple]=None) \
-> Tuple[List[int],List[int]]:
"""
Determines the field ends of an inferred Symbol
@@ -671,17 +818,19 @@ def _inferredandtrueFieldEnds(self, nsymbol: netzob.Symbol, tformat: List[Tuple]
If not given, determines and uses the true format of the first message in the Symbol.
:return: inferred field ends; true field ends
+
+ :raises WatchdogTimeout: If Netzob symbol parsing times out
"""
# Fallback, which uses only the first message in the symbol to determine a dissection
if tformat is None:
if self.debug:
print("Determine true formats for symbol via tshark...")
# get the raw message for the first layer 5 message in nsymbol and dissect
- tformat = list(self.__comparator.dissections[self.__comparator.messages[nsymbol.messages[0]]])
+ tformat = list(self._comparator.dissections[self._comparator.messages[nsymbol.messages[0]]])
samplemessage = nsymbol.messages[0]
else:
- l2msgs = { self.__comparator.messages[msg]: msg for msg in nsymbol.messages }
- tformats = {k: self.__comparator.dissections[k] for k in l2msgs.keys()}
+ l2msgs = {self._comparator.messages[msg]: msg for msg in nsymbol.messages}
+ tformats = {k: self._comparator.dissections[k] for k in l2msgs.keys()}
samplemessage = None # initialize to later check on it
# get a sample for the tformat
for m, tf in tformats.items():
@@ -733,6 +882,7 @@ def _inferredandtrueFieldEnds(self, nsymbol: netzob.Symbol, tformat: List[Tuple]
#####
# Lists of inferred and dissector field end indices in byte within message:
# determine the indices of the field ends in the message byte sequence
+        # ... here a WatchdogTimeout may be raised
nfieldends = MessageComparator.fieldEndsPerSymbol(nsymbol, samplemessage)
tfieldends = MessageComparator.fieldEndsFromLength(tfieldlengths)
@@ -743,62 +893,16 @@ def _inferredandtrueFieldEnds(self, nsymbol: netzob.Symbol, tformat: List[Tuple]
return nfieldends, tfieldends
-
def calcFMS(self):
- import math, numpy
- from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage
-
fmslist = list()
- l2rmsgs = {self.__comparator.messages[msg]: msg for msg in self.__inferredSymbol.messages}
- tformats = {k: self.__comparator.dissections[k] for k in l2rmsgs.keys()} # l2msg: tuple
- for l2msg, l4msg in l2rmsgs.items(): # type: (RawMessage, AbstractMessage)
- exactmatches = self.exactMatches()
- nearmatches = self.nearMatches() # TODO check the associated inferred field index (sometimes wrong?)
- nearestdistances = {tfe: min(dists) for tfe, dists
- in self.distancesFromDissectorFieldEnds().items()
- if tfe in nearmatches}
- # fieldendscopes = dm.dissectorFieldEndScopes()
-
- exactcount = len(exactmatches)
- nearcount = len(nearmatches)
- fieldcount = len(self.dissectionFields) - 1
- inferredcount = len(self.inferredFields) - 1
-
- # nearmatches weighted by distance, /2 to increase spread -> less intense penalty for deviation from 0
- nearweights = {tfe: math.exp(- ((dist / 2) ** 2))
- for tfe, dist in nearestdistances.items()}
-
- # penalty for over-/under-specificity (normalized to true field count)
- try:
- specificyPenalty = math.exp(- ((fieldcount - inferredcount) / fieldcount) ** 2)
- except ZeroDivisionError:
- raise ZeroDivisionError("Offending message:\n{}".format(l4msg.data.hex()))
- matchGain = (exactcount + sum([nearweights[nm] for nm in nearmatches.keys()])) / fieldcount
- score = specificyPenalty * (
- # exact matches + weighted near matches
- matchGain)
-
- fms = FormatMatchScore()
- fms.message = l4msg
- fms.symbol = self.__inferredSymbol
- fms.trueFormat = tformats[l2msg]
- fms.score = score
- fms.specificyPenalty = specificyPenalty
- fms.matchGain = matchGain
- fms.nearWeights = nearweights
- fms.meanDistance = numpy.mean(list(nearestdistances.values())) if len(nearestdistances) > 0 else numpy.nan
- fms.trueCount = fieldcount
- fms.inferredCount = inferredcount
- fms.exactCount = exactcount
- fms.nearCount = nearcount
- fms.specificy = fieldcount - inferredcount
- fms.exactMatches = exactmatches
- fms.nearMatches = nearmatches
-
+ for msg in self._inferredSymbol.messages:
+            # TODO calculate independent FMSs for each symbol member message; currently
+            #  this results in an FMS that is identical for all messages within the symbol!
+ fms = super().calcFMS()
+ fms.symbol = self._inferredSymbol
fmslist.append(fms)
return fmslist
-
@staticmethod
def symbolListFMS(mc: MessageComparator, symbols: List[netzob.Symbol]) -> Dict[AbstractMessage, FormatMatchScore]:
"""
@@ -808,19 +912,31 @@ def symbolListFMS(mc: MessageComparator, symbols: List[netzob.Symbol]) -> Dict[A
:param symbols: list of inferred symbols
:return: OrderedDict of messages mapping to their FormatMatchScore
"""
-
matchprecisions = OrderedDict()
for counter, symbol in enumerate(symbols):
symbol.name = "{:s}{:2d}".format(symbol.name, counter)
-
- dm = DissectorMatcher(mc, symbol)
- fmslist = dm.calcFMS()
- for fms in fmslist:
- matchprecisions[fms.message] = fms
+ try:
+ try:
+ dm = DissectorMatcher(mc, symbol)
+ except WatchdogTimeout as e:
+ print(e, "Continuing with next symbol...")
+ for msg in symbol.messages:
+ matchprecisions[msg] = FormatMatchScore(msg, symbol) # add empty dummy FMS
+ continue
+ fmslist = dm.calcFMS()
+ for fms in fmslist:
+ matchprecisions[fms.message] = fms
+ except RuntimeError as e:
+ print("\n\n# # # Messages # # #\n")
+ for msg in symbol.messages:
+ # # add dummy entries without values to denote (most probably) failed message parsing by Netzob
+ # matchprecisions[msg] = FormatMatchScore(msg, symbol)
+ print(msg.data.hex())
+ print()
+ raise e
return matchprecisions
-
@staticmethod
def thresymbolListsFMS(mc: MessageComparator,
threshSymbTfmt: Dict[int, Dict[netzob.Symbol, List[List[Tuple[str, int]]]]]) \
diff --git a/src/nemere/validation/messageParser.py b/src/nemere/validation/messageParser.py
index 3cece653..a407444f 100644
--- a/src/nemere/validation/messageParser.py
+++ b/src/nemere/validation/messageParser.py
@@ -3,35 +3,399 @@
Interpret fields and data types for comparison to an inference result.
"""
-import json
-from typing import List, Tuple, Union, Dict, Set, Union, Any, Callable
+import json, re
+from typing import List, Tuple, Dict, Set, Union, Generator, Type, Sequence, Any, Callable
from pprint import pprint
+from itertools import chain
+import inspect
import IPython
from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage, AbstractMessage
+from nemere.validation import protocols
from nemere.validation.tsharkConnector import TsharkConnector
+class MessageTypeIdentifiers(object):
+ """
+    Fields or combinations of fields that identify the message type for a specific protocol.
+ """
+
+ def __init__(self, compatibleProtocols: Sequence[Type['ParsingConstants']]):
+ self._collect_superclasses()
+ for p in compatibleProtocols:
+ self.importProtocol(p.MESSAGE_TYPE_IDS)
+
+ FOR_PROTCOL = dict()
+ NAMED_TYPES = dict()
+
+ def _collect_superclasses(self):
+ supers = type(self).mro()
+ for mti in supers:
+ if issubclass(mti, MessageTypeIdentifiers):
+ self.importProtocol(mti)
+
+ def __resolveTypeName(self, fieldname: str, fieldvalue: str):
+ return self.NAMED_TYPES[fieldname][fieldvalue] \
+ if fieldname in self.NAMED_TYPES \
+ and fieldvalue in self.NAMED_TYPES[fieldname] \
+ else "{}={}".format(fieldname, fieldvalue)
+
+ def typeOfMessage(self, message: 'ParsedMessage'):
+ if message.protocolname in self.FOR_PROTCOL:
+ idFields = self.FOR_PROTCOL[message.protocolname]
+ resolvedTypeName = []
+ for ifield in idFields:
+ if isinstance(ifield, dict): # complex type identifiers with filter and selector
+ ifv = message.getValuesByName(ifield['field'])
+ if not ifv:
+ continue # to next field
+ for idvalue in ifv:
+ if ifield['filter'](idvalue):
+ selectedid = ifield['select'](idvalue)
+ resolvedTypeName.append(
+ self.__resolveTypeName(ifield['field'], selectedid))
+ else: # simple identifier
+ selectedid = message.getValuesByName(ifield)
+ for ifv in selectedid:
+ # noinspection PyTypeChecker
+ resolvedTypeName.append(self.__resolveTypeName(ifield, ifv))
+ if len(resolvedTypeName) > 0:
+ return ":".join(resolvedTypeName)
+
+ # message identifier not known (outer if-statement)
+ # or filter never matched (if-statement inside dict handling branch)
+ raise Exception("No message type identifier known for protocol {}".format(message.protocolname))
+
+ @staticmethod
+ def updateRecursive(d: Dict[Any, Union[Dict, List, Any]], u: Dict[Any, Union[Dict, List, Any]]):
+ """
+ Update dict d with values from dict u. If u's value is a dict,
+ recursively update the corresponding dict in d's value.
+ """
+ for k, v in u.items():
+ if isinstance(v, Dict):
+ d[k] = MessageTypeIdentifiers.updateRecursive(d.get(k, {}), v)
+ elif isinstance(v, List): # a list value is not overwritten but extended with the values in u's list
+ listFromD = d.get(k, [])
+ listFromD.extend(v)
+ d[k] = MessageTypeIdentifiers._unique(listFromD)
+ else:
+ d[k] = v
+ return d
+
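The merge semantics in short: nested dicts are merged recursively, list values are extended and de-duplicated, and plain values are overwritten. For example:

base   = {'nbss': ['nbss.type'], 'named': {'smb.cmd': {'04': 'Close'}}}
update = {'nbss': ['smb.cmd'],   'named': {'smb.cmd': {'72': 'Negotiate'}}}
MessageTypeIdentifiers.updateRecursive(base, update)
assert base == {'nbss': ['nbss.type', 'smb.cmd'],
                'named': {'smb.cmd': {'04': 'Close', '72': 'Negotiate'}}}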
+ @staticmethod
+ def _unique(discriminatorList: List[Union[str, Dict[str, Union[str, Callable]]]]):
+ """removes duplicates from discriminatorList while keeping its order.
+ If a complex filter for the same field is defined a second time, the second one is used
+ at the position of the first appearance of a filter for this field."""
+ newList = []
+ for discriminator in discriminatorList:
+ if isinstance(discriminator, str) and discriminator not in newList:
+ newList.append(discriminator)
+ elif isinstance(discriminator, Dict):
+ replaced = False
+ for index, newEntry in enumerate(newList):
+ if isinstance(newEntry, Dict) and newEntry['field'] == discriminator['field']:
+ newList[index] = discriminator
+ replaced = True
+ break
+ if not replaced:
+ newList.append(discriminator)
+ return newList
+
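The replacement rule matters for the complex discriminators: a later filter for the same field takes the position of the earlier one, while plain strings are simply de-duplicated:

older = {'field': 'smb.cmd', 'filter': lambda v: True, 'select': lambda w: w}
newer = {'field': 'smb.cmd', 'filter': lambda v: v != 'ff', 'select': lambda w: w}
result = MessageTypeIdentifiers._unique(['ntp.flags', older, 'ntp.flags', newer])
assert result == ['ntp.flags', newer]   # newer replaced older in place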
+ def importProtocol(self, mtid: Type['MessageTypeIdentifiers']):
+ MessageTypeIdentifiers.updateRecursive(self.FOR_PROTCOL, mtid.FOR_PROTCOL)
+ MessageTypeIdentifiers.updateRecursive(self.NAMED_TYPES, mtid.NAMED_TYPES)
+
+
+class MessageTypeIdentifiers226(MessageTypeIdentifiers):
+ FOR_PROTCOL = {
+ 'bootp' : ['bootp.option.dhcp'],
+ 'dns' : ['dns.flags', 'dns.qry.type'],
+ 'nbns' : ['nbns.flags'],
+ 'nbss' : ['nbss.type', {
+ 'field': 'smb.cmd',
+ 'filter': lambda v: v != 'ff',
+ 'select': lambda w: w
+ }, {
+ 'field': 'smb.flags',
+ 'filter': lambda v: True,
+ 'select': lambda w: (int.from_bytes(bytes.fromhex(w), "big") & 128) != 0 # first bit denotes request/response
+ }],
+ 'ntp' : ['ntp.flags', 'ntp.stratum']
+ }
+
+ NAMED_TYPES = { # assumes hex bytes are lower-case
+ 'bootp.option.dhcp' : {
+ '01': 'Discover',
+ '02': 'Offer',
+ '03': 'Request',
+ '04': 'Decline',
+ '05': 'ACK',
+ '07': 'Release',
+ '08': 'Inform',
+ },
+ 'nbss.type' : {
+ '00': 'SMB'
+ },
+ 'dns.flags' : {
+ '0100': 'Standard query',
+ '8182': 'Response (failure)',
+ '8183': 'Response (no such name)',
+ '8580': 'Response (success)',
+ },
+ 'dns.qry.type': {
+ '0001': 'A',
+ '0002': 'NS',
+ '0010': 'TXT',
+ '001c': 'AAAA',
+ '000f': 'MX',
+ '000c': 'PTR',
+ '0006': 'SOA',
+ '0021': 'SRV',
+ },
+ 'smb.cmd': {
+ '04': 'Close (0x04)',
+ '24': 'Locking AndX Request (0x24)',
+ '2b': 'Echo Request (0x2b)',
+ '2e': 'Read AndX (0x2e)',
+ '2f': 'Write AndX Response (0x2f)',
+ 'a0': 'NT Trans (0xa0)',
+ 'a2': 'NT Create AndX (0xa2)',
+ 'a4': 'NT Cancel (0xa4)',
+ '71': 'Tree Disconnect (0x71)',
+ '72': 'Negotiate Protocol (0x72)',
+ '73': 'Session Setup AndX (0x73)',
+ '74': 'Logoff AndX (0x74)',
+ '75': 'Tree Connect AndX (0x75)',
+ },
+ 'smb.flags': { # first bit == 0 denotes request
+ True: 'response',
+ False: 'request',
+ },
+ 'nbns.flags': {
+ '0110': 'Name query',
+ '2810': 'Registration',
+ '2910': 'Registration (recursion)',
+ '3010': 'Release',
+ '8500': 'Response',
+ },
+ 'ntp.flags': {
+ '13': 'v2 client',
+ '19': 'v3 symmetric active',
+ '1b': 'v3 client',
+ '1c': 'v3 server',
+ '23': 'v4 client',
+ '24': 'v4 server',
+ '25': 'v4 broadcast',
+ 'd9': 'v3 symmetric active (unsynchronized, MAC)',
+ 'db': 'v3 client (unsynchronized)',
+ 'dc': 'v3 server (unsynchronized)',
+ 'e3': 'v4 client (unsynchronized, MAC)',
+ 'e4': 'v4 server (unsynchronized)',
+ 'e5': 'v4 broadcast (unsynchronized)',
+ },
+ # 'ntp.stratum': {
+ # '00': '',
+ # '03': '',
+ # '04': '',
+ # '05': '',
+ # '06': '',
+ # }
+ } # type: Dict[str, Dict[str, str]]
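With these tables, typeOfMessage() joins all resolved identifier parts with colons. For an SMB Negotiate Protocol response (nbss.type = 00, smb.cmd = 72, first smb.flags bit set), the three discriminators for 'nbss' would resolve to:

# 'nbss.type' -> 'SMB', 'smb.cmd' -> 'Negotiate Protocol (0x72)', 'smb.flags' -> 'response'
expectedType = "SMB:Negotiate Protocol (0x72):response"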
+
+
+
+class MessageTypeIdentifiers325(MessageTypeIdentifiers226):
+ """
+ Adaptation for tshark version > 3
+ """
+ FOR_PROTCOL = dict()
+ FOR_PROTCOL['dhcp'] = ['dhcp.option.dhcp']
+
+ NAMED_TYPES = dict() # type: Dict[str, Dict[str, str]]
+ NAMED_TYPES.update({
+ 'dhcp.option.dhcp': MessageTypeIdentifiers226.NAMED_TYPES['bootp.option.dhcp'],
+ })
+
+
class ParsingConstants(object):
    """
    Class to hold constants necessary for the interpretation of the tshark dissectors.
    """
+ def __init__(self):
+ compatibleProtocols = list(type(self).protocols(type(self).COMPATIBLE_TO))
+
+        # define copies of the class variables to use in the instance.
+ # The actual initialization with values should be done in the class definition.
+ self.TYPELOOKUP = type(self)._collect_typelookup() # type: Dict[str, str]
+ for p in compatibleProtocols:
+ self.TYPELOOKUP.update(p.TYPELOOKUP)
+ # names of field nodes in the json which should be ignored.
+ self.IGNORE_FIELDS = type(self)._collect_ignore_fields() \
+ + list(chain.from_iterable(p.IGNORE_FIELDS for p in compatibleProtocols))
+ self.EXCLUDE_SUB_FIELDS = type(self)._collect_exclude_sub_fields() \
+ + list(chain.from_iterable(p.EXCLUDE_SUB_FIELDS for p in compatibleProtocols))
+ self.INCLUDE_SUBFIELDS = type(self)._collect_include_subfields() \
+ + list(chain.from_iterable(p.INCLUDE_SUBFIELDS for p in compatibleProtocols))
+ self.INCLUDE_SUBFIELDS_RE = type(self)._collect_include_subfields_re() \
+ + list(chain.from_iterable(p.INCLUDE_SUBFIELDS_RE for p in compatibleProtocols))
+ """:type List[re.Pattern]"""
+ self.RECORD_STRUCTURE = type(self)._collect_record_structure() \
+ + list(chain.from_iterable(p.RECORD_STRUCTURE for p in compatibleProtocols))
+ self.prehooks = type(self)._collect_prehooks()
+ for p in compatibleProtocols:
+ self.prehooks.update(p.prehooks)
+ self.posthooks = type(self)._collect_posthooks()
+ for p in compatibleProtocols:
+ self.posthooks.update(p.posthooks)
+ self.MESSAGE_TYPE_IDS = type(self).MESSAGE_TYPE_IDS(compatibleProtocols) # type: MessageTypeIdentifiers
+
+ COMPATIBLE_TO = b''
# see https://www.tcpdump.org/linktypes.html
LINKTYPES = {
- # 'NULL': 0,
- 'ETHERNET': 1,
- # IEEE802_5 = 6
- # PPP = 9
- 'RAW_IP': 101
- # IEEE802_11 = 105
+ 'undecoded' : -1, # added to represent a non-decoded raw trace without link type information
+ 'NULL': 0, # pcapy.DLT_NULL
+ 'ETHERNET': 1, # pcapy.DLT_EN10MB
+ 'IEEE802_5': 6, # pcapy.DLT_IEEE802
+ 'PPP': 9, # pcapy.DLT_PPP
+ 'RAW_IP': 101, # pcapy.DLT_RAW = 12 !!!
+ 'IEEE802_11': 105, # pcapy.DLT_IEEE802_11
+ 'RadioTap': 23,
+ 'IEEE802_11_RADIO': 127,
}
-
- # mapping of field names to general value types.
TYPELOOKUP = {'delimiter': 'chars',
'data.data': 'unknown'}
- """:type: Dict[str, str]"""
+ """
+ mapping of field names to general value types.
+ see also Wireshark dissector reference: https://www.wireshark.org/docs/dfref/
+ :type: Dict[str, str]
+ """
+
+    IGNORE_FIELDS = list()
+    """names of field nodes in the json which should be ignored."""
+
+    EXCLUDE_SUB_FIELDS = list()
+    """a convenience list for debugging: names of fields that need not give a warning if ignored."""
+
+ INCLUDE_SUBFIELDS = list()
+ """names of field nodes in the json which should be descended into."""
+
+ INCLUDE_SUBFIELDS_RE = list()
+ """regexes of field nodes in the json which should be descended into."""
+
+ RECORD_STRUCTURE = list()
+ """
+ names of field nodes in the json that have a
+    record structure (list[list[tuple]], not list[tuple[str, tuple]]).
+ """
+
+ # HOOKS register. See :func:`walkSubTree()`.
+ prehooks = dict()
+ posthooks = dict()
+ MESSAGE_TYPE_IDS = MessageTypeIdentifiers # type: Type[MessageTypeIdentifiers]
+
+ @classmethod
+ def _collect_ignore_fields(cls):
+ supers = cls.mro()
+
+ retList = list()
+ for pc in supers:
+ if issubclass(pc, ParsingConstants):
+ retList += pc.IGNORE_FIELDS
+ return retList
+
+ @classmethod
+ def _collect_exclude_sub_fields(cls):
+ supers = cls.mro()
+
+ retList = list()
+ for pc in supers:
+ if issubclass(pc, ParsingConstants):
+ retList += pc.EXCLUDE_SUB_FIELDS
+ return retList
+
+ @classmethod
+ def _collect_include_subfields(cls):
+ supers = cls.mro()
+
+ retList = list()
+ for pc in supers:
+ if issubclass(pc, ParsingConstants):
+ retList += pc.INCLUDE_SUBFIELDS
+ return retList
+
+ @classmethod
+ def _collect_include_subfields_re(cls) -> List[re.Pattern]:
+ supers = cls.mro()
+
+ retList = list()
+ for pc in supers:
+ if issubclass(pc, ParsingConstants):
+ retList += pc.INCLUDE_SUBFIELDS_RE
+ return retList
+
+ @classmethod
+ def _collect_record_structure(cls):
+ supers = cls.mro()
+
+ retList = list()
+ for pc in supers:
+ if issubclass(pc, ParsingConstants):
+ retList += pc.RECORD_STRUCTURE
+ return retList
+
+ @classmethod
+    def _collect_typelookup(cls):
+        supers = cls.mro()
+
+        # mro() yields the subclass first, so entries from superclasses
+        # overwrite equally named subclass entries in this merge
+        retDict = dict()
+        for pc in supers:
+            if issubclass(pc, ParsingConstants):
+                retDict.update(pc.TYPELOOKUP)
+        return retDict
+
+    @classmethod
+    def _collect_prehooks(cls):
+        supers = cls.mro()
+
+        retDict = dict()
+        for pc in supers:
+            if issubclass(pc, ParsingConstants):
+                retDict.update(pc.prehooks)
+        return retDict
+
+    @classmethod
+    def _collect_posthooks(cls):
+        supers = cls.mro()
+
+        retDict = dict()
+        for pc in supers:
+            if issubclass(pc, ParsingConstants):
+                retDict.update(pc.posthooks)
+        return retDict
+
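The eight _collect_* class methods above share a single pattern; a possible consolidation (a sketch, not part of the patch) that keeps the same MRO walk and merge order:

@classmethod
def _collect_attribute(cls, attrname: str):
    # walk the MRO (subclass first, as above) and merge the class attribute
    # named attrname; lists are concatenated, dicts are updated
    merged = None
    for pc in cls.mro():
        if not issubclass(pc, ParsingConstants):
            continue
        value = getattr(pc, attrname)
        if merged is None:
            merged = type(value)()
        if isinstance(merged, dict):
            merged.update(value)
        else:
            merged.extend(value)
    return merged

Each collector call would then reduce to a one-liner such as cls._collect_attribute('TYPELOOKUP').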
+ @staticmethod
+ def protocols(compatibleTo: bytes) -> Generator[Type['ParsingConstants'], None, None]:
+ from os.path import dirname
+ import pkgutil
+ from importlib import import_module
+
+ pkgpath = dirname(protocols.__file__)
+ modules = [name for _, name, _ in pkgutil.iter_modules([pkgpath], ".protocols.")]
+
+ importedProtocols = list()
+ for protomod in modules:
+ importedProtocols.append(
+ import_module(protomod, package='.'.join(__name__.split('.')[:-1])) # nemere.validation.__name__
+ )
+
+ for improt in importedProtocols:
+ for name, obj in inspect.getmembers(improt):
+ if inspect.isclass(obj) and issubclass(obj, ParsingConstants) and obj != ParsingConstants and \
+ obj.COMPATIBLE_TO == compatibleTo:
+ yield obj
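A hypothetical check of this plugin discovery, listing every protocol constants class in nemere.validation.protocols that declares compatibility with tshark 2.2.6:

for pc in ParsingConstants.protocols(b'2.2.6'):
    print(pc.__name__, pc.COMPATIBLE_TO)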
+
# noinspection PyDictCreation
@@ -42,8 +406,8 @@ class ParsingConstants226(ParsingConstants):
TODO Determine up to which exact tshark version this JSON output format is used.
"""
-
COMPATIBLE_TO = b'2.2.6'
+ MESSAGE_TYPE_IDS = MessageTypeIdentifiers226
# names of field nodes in the json which should be ignored.
IGNORE_FIELDS = [
@@ -60,7 +424,7 @@ class ParsingConstants226(ParsingConstants):
'ntlmssp.challenge.target_info_raw'
]
- EXCLUDE_SUB_FIELDS = [ # a convenience list for debugging: names of fields that need not give a warning if ignored.
+ EXCLUDE_SUB_FIELDS = [
'dns.flags_tree', 'ntp.flags_tree',
'bootp.flags_tree', 'bootp.fqdn.flags_tree', 'bootp.secs_tree',
'smb.flags_tree', 'smb.flags2_tree', 'smb.sm_tree', 'smb.server_cap_tree',
@@ -148,7 +512,7 @@ class ParsingConstants226(ParsingConstants):
TYPELOOKUP['bootp.hw.len'] = 'int'
TYPELOOKUP['bootp.hops'] = 'int'
TYPELOOKUP['bootp.id'] = 'id'
- TYPELOOKUP['bootp.secs'] = 'int'
+ TYPELOOKUP['bootp.secs'] = 'int_le'
TYPELOOKUP['bootp.flags'] = 'flags'
TYPELOOKUP['bootp.ip.client'] = 'ipv4'
TYPELOOKUP['bootp.ip.your'] = 'ipv4'
@@ -240,16 +604,16 @@ class ParsingConstants226(ParsingConstants):
TYPELOOKUP['smb.nt_status'] = 'int' # has value: 00000000
TYPELOOKUP['smb.flags'] = 'flags' # has value: 18
TYPELOOKUP['smb.flags2'] = 'flags' # has value: 07c8
- TYPELOOKUP['smb.pid.high'] = 'int' # has value: 0000
- TYPELOOKUP['smb.signature'] = 'checksum' # has value: 4253525350594c20
+ TYPELOOKUP['smb.pid.high'] = 'id' # has value: 0000
+ TYPELOOKUP['smb.signature'] = 'crypto' # has value: 4253525350594c20
TYPELOOKUP['smb.reserved'] = 'int' # has value: 0000
- TYPELOOKUP['smb.tid'] = 'flags' # originally was 'id' but behaves like flags # has value: 0000
- TYPELOOKUP['smb.pid'] = 'flags' # originally was 'id' but behaves like flags # has value: fffe
- TYPELOOKUP['smb.uid'] = 'flags' # originally was 'id' but behaves like flags # has value: 0000
- TYPELOOKUP['smb.mid'] = 'flags' # originally was 'id' but behaves like flags # has value: 4000
+ TYPELOOKUP['smb.tid'] = 'id' # 'id' behaves like flags # has value: 0000
+ TYPELOOKUP['smb.pid'] = 'id' # 'id' behaves like flags # has value: fffe
+ TYPELOOKUP['smb.uid'] = 'id' # 'id' behaves like flags # has value: 0000
+ TYPELOOKUP['smb.mid'] = 'id' # 'id' behaves like flags # has value: 4000
# nbns
- TYPELOOKUP['nbns.id'] = 'int'
+ TYPELOOKUP['nbns.id'] = 'id'
TYPELOOKUP['nbns.flags'] = 'flags' # has value: 0110
TYPELOOKUP['nbns.count.queries'] = 'int' # has value: 0001
TYPELOOKUP['nbns.count.answers'] = 'int' # has value: 0000
@@ -267,58 +631,58 @@ class ParsingConstants226(ParsingConstants):
TYPELOOKUP['nbss.type'] = 'id' # has value: 00
TYPELOOKUP['nbss.length'] = 'int' # has value: 000038
TYPELOOKUP['smb.wct'] = 'int' # has value: 07
- TYPELOOKUP['smb.andxoffset'] = 'int' # has value: 3800 - little endian
- TYPELOOKUP['smb.connect.support'] = 'int' # has value: 0100
- TYPELOOKUP['smb.bcc'] = 'int' # has value: 0700 (Byte count)
+ TYPELOOKUP['smb.andxoffset'] = 'int_le' # has value: 3800 - little endian
+ TYPELOOKUP['smb.connect.support'] = 'int_le' # has value: 0100
+ TYPELOOKUP['smb.bcc'] = 'int_le' # has value: 0700 (Byte count)
    TYPELOOKUP['smb.service'] = 'enum'  # it's coded as 8-bit ASCII 'chars', e.g.: 49504300 - http://ubiqx.org/cifs/Book.html p. 311
TYPELOOKUP['smb.native_fs'] = 'chars' # has value: 0000
- TYPELOOKUP['smb.tpc'] = 'int' # has value: 1a00
- TYPELOOKUP['smb.tdc'] = 'int' # has value: 0000
- TYPELOOKUP['smb.mpc'] = 'int' # has value: 0800
- TYPELOOKUP['smb.mdc'] = 'int' # has value: 6810
+ TYPELOOKUP['smb.tpc'] = 'int_le' # has value: 1a00
+ TYPELOOKUP['smb.tdc'] = 'int_le' # has value: 0000
+ TYPELOOKUP['smb.mpc'] = 'int_le' # has value: 0800
+ TYPELOOKUP['smb.mdc'] = 'int_le' # has value: 6810
TYPELOOKUP['smb.msc'] = 'int' # has value: 00
TYPELOOKUP['smb.transaction.flags'] = 'flags' # has value: 0000
- TYPELOOKUP['smb.timeout'] = 'int' # has value: 88130000
- TYPELOOKUP['smb.pc'] = 'int' # has value: 1a00
- TYPELOOKUP['smb.po'] = 'int' # has value: 5c00
- TYPELOOKUP['smb.dc'] = 'int' # has value: 0000
- TYPELOOKUP['smb.data_offset'] = 'int' # has value: 0000
+ TYPELOOKUP['smb.timeout'] = 'int_le' # has value: 88130000
+ TYPELOOKUP['smb.pc'] = 'int_le' # has value: 1a00
+ TYPELOOKUP['smb.po'] = 'int_le' # has value: 5c00
+ TYPELOOKUP['smb.dc'] = 'int_le' # has value: 0000
+ TYPELOOKUP['smb.data_offset'] = 'int_le' # has value: 0000
TYPELOOKUP['smb.sc'] = 'int' # has value: 00
TYPELOOKUP['smb.trans_name'] = 'chars' # has value: 5c0050004900500045005c004c0041004e004d0041004e000000
- TYPELOOKUP['smb.padding'] = 'chars' # has value: 0000
- TYPELOOKUP['smb.pd'] = 'int' # has value: 0000
- TYPELOOKUP['smb.data_disp'] = 'int' # has value: 0000
- TYPELOOKUP['lanman.status'] = 'int' # has value: 0000
- TYPELOOKUP['lanman.convert'] = 'int' # has value: 3f0f
- TYPELOOKUP['lanman.entry_count'] = 'int' # has value: 0b00
- TYPELOOKUP['lanman.available_count'] = 'int' # has value: 0b00
+ TYPELOOKUP['smb.padding'] = 'pad' # has value: 0000
+ TYPELOOKUP['smb.pd'] = 'int_le' # has value: 0000
+ TYPELOOKUP['smb.data_disp'] = 'int_le' # has value: 0000
+ TYPELOOKUP['lanman.status'] = 'int_le' # has value: 0000
+ TYPELOOKUP['lanman.convert'] = 'int_le' # has value: 3f0f
+ TYPELOOKUP['lanman.entry_count'] = 'int_le' # has value: 0b00
+ TYPELOOKUP['lanman.available_count'] = 'int_le' # has value: 0b00
TYPELOOKUP['lanman.server.name'] = 'chars' # has value: 44432d424c5545000000000000000000
TYPELOOKUP['lanman.server.major'] = 'int' # has value: 05
TYPELOOKUP['lanman.server.minor'] = 'int' # has value: 02
- TYPELOOKUP['browser.server_type'] = 'int' # has value: 2b108400
+ TYPELOOKUP['browser.server_type'] = 'int_le' # has value: 2b108400
TYPELOOKUP['lanman.server.comment'] = 'chars' # has value: 00
TYPELOOKUP['smb.ea.error_offset'] = 'int' # has value: 0000
- TYPELOOKUP['smb.create.time'] = 'timestamp' # has value: a34bd360ef84cc01
- TYPELOOKUP['smb.access.time'] = 'timestamp' # has value: a34bd360ef84cc01
- TYPELOOKUP['smb.last_write.time'] = 'timestamp' # has value: 2bd5dc60ef84cc01
- TYPELOOKUP['smb.change.time'] = 'timestamp' # has value: 2bd5dc60ef84cc01
+ TYPELOOKUP['smb.create.time'] = 'timestamp_le' # has value: a34bd360ef84cc01
+ TYPELOOKUP['smb.access.time'] = 'timestamp_le' # has value: a34bd360ef84cc01
+ TYPELOOKUP['smb.last_write.time'] = 'timestamp_le' # has value: 2bd5dc60ef84cc01
+ TYPELOOKUP['smb.change.time'] = 'timestamp_le' # has value: 2bd5dc60ef84cc01
TYPELOOKUP['smb.file_attribute'] = 'flags' # has value: 26000000
TYPELOOKUP['smb.unknown_data'] = 'unknown' # has value: 00000000
TYPELOOKUP['smb.max_buf'] = 'int' # has value: 0411
- TYPELOOKUP['smb.max_mpx_count'] = 'int' # has value: 3200
+ TYPELOOKUP['smb.max_mpx_count'] = 'int_le' # has value: 3200
TYPELOOKUP['smb.vc'] = 'int' # has value: 0000
TYPELOOKUP['smb.session_key'] = 'bytes' # has value: 00000000
- TYPELOOKUP['smb.security_blob_len'] = 'int' # has value: 6b00
+ TYPELOOKUP['smb.security_blob_len'] = 'int_le' # has value: 6b00
TYPELOOKUP['smb.server_cap'] = 'flags' # has value: d4000080
TYPELOOKUP['smb.security_blob'] = 'bytes'
TYPELOOKUP['smb.native_os'] = 'chars'
TYPELOOKUP['smb.native_lanman'] = 'chars'
TYPELOOKUP['smb.primary_domain'] = 'chars' # has value: 0000
TYPELOOKUP['smb.trans2.cmd'] = 'id' # has value: 1000
- TYPELOOKUP['smb.max_referral_level'] = 'int' # has value: 0300
+ TYPELOOKUP['smb.max_referral_level'] = 'int_le' # has value: 0300
TYPELOOKUP['smb.file'] = 'chars' # has value: 5c0042004c005500450034000000
TYPELOOKUP['smb.setup.action'] = 'flags' # has value: 0000
- TYPELOOKUP['smb.file_name_len'] = 'int' # has value: 3000
+ TYPELOOKUP['smb.file_name_len'] = 'int_le' # has value: 3000
TYPELOOKUP['smb.create_flags'] = 'flags' # has value: 16000000
TYPELOOKUP['smb.rfid'] = 'id' # has value: 00000000
TYPELOOKUP['smb.access_mask'] = 'flags' # has value: 89000200
@@ -326,93 +690,327 @@ class ParsingConstants226(ParsingConstants):
TYPELOOKUP['smb.share_access'] = 'flags' # has value: 07000000
TYPELOOKUP['smb.create.disposition'] = 'flags' # has value: 01000000
TYPELOOKUP['smb.create_options'] = 'flags' # has value: 40000000
- TYPELOOKUP['smb.impersonation.level'] = 'int' # has value: 02000000
+ TYPELOOKUP['smb.impersonation.level'] = 'int_le' # has value: 02000000
TYPELOOKUP['smb.security.flags'] = 'flags' # has value: 00
TYPELOOKUP['smb.connect.flags'] = 'flags' # has value: 0800
- TYPELOOKUP['smb.pwlen'] = 'int' # has value: 0100
+ TYPELOOKUP['smb.pwlen'] = 'int_le' # has value: 0100
TYPELOOKUP['smb.password'] = 'bytes' # has value: 00
TYPELOOKUP['smb.path'] = 'chars' # has value: 5c005c005700570057005c0049005000430024000000
TYPELOOKUP['nbss.continuation_data'] = 'bytes'
TYPELOOKUP['smb.volume.serial'] = 'bytes' # has value: eff27040
TYPELOOKUP['smb.volume.label.len'] = 'int' # has value: 00000000
- TYPELOOKUP['smb.qpi_loi'] = 'int' # has value: ec03
+ TYPELOOKUP['smb.qpi_loi'] = 'int_le' # has value: ec03
TYPELOOKUP['smb.oplock.level'] = 'int' # has value: 02
- TYPELOOKUP['smb.fid'] = 'int' # has value: 07c0
+ TYPELOOKUP['smb.fid'] = 'id' # (int_le) has value: 07c0
TYPELOOKUP['smb.create.action'] = 'flags' # has value: 01000000
TYPELOOKUP['smb.end_of_file'] = 'bytes' # has value: 6b00000000000000
- TYPELOOKUP['smb.file_type'] = 'int' # has value: 0000
+ TYPELOOKUP['smb.file_type'] = 'int_le' # has value: 0000
TYPELOOKUP['smb.ipc_state'] = 'flags' # has value: 0700
TYPELOOKUP['smb.is_directory'] = 'flags' # has value: 00
- TYPELOOKUP['smb.volume_guid'] = 'id' # has value: 00000000000000000000000000000000
- TYPELOOKUP['smb.create.file_id_64b'] = 'id' # has value: 0000000000000000
+ TYPELOOKUP['smb.volume_guid'] = 'chars' # id, mostly uses utf8-chars, has value: 00000000000000000000000000000000
+ TYPELOOKUP['smb.create.file_id_64b'] = 'chars' # id, mostly uses utf8-chars, has value: 0000000000000000
TYPELOOKUP['smb.offset'] = 'int' # has value: 00000000
- TYPELOOKUP['smb.maxcount_low'] = 'int' # has value: 6b00
- TYPELOOKUP['smb.mincount'] = 'int' # has value: 6b00
- TYPELOOKUP['smb.maxcount_high'] = 'int' # has value: 00000000
- TYPELOOKUP['smb.remaining'] = 'int' # has value: 6b00
- TYPELOOKUP['smb.offset_high'] = 'int' # has value: 00000000
- TYPELOOKUP['smb.qfsi_loi'] = 'int' # has value: 0201
- TYPELOOKUP['smb.dialect.index'] = 'int' # has value: 0500
+ TYPELOOKUP['smb.maxcount_low'] = 'int_le' # has value: 6b00
+ TYPELOOKUP['smb.mincount'] = 'int_le' # has value: 6b00
+ TYPELOOKUP['smb.maxcount_high'] = 'int_le' # has value: 00000000
+ TYPELOOKUP['smb.remaining'] = 'int_le' # has value: 6b00
+ TYPELOOKUP['smb.offset_high'] = 'int_le' # has value: 00000000
+ TYPELOOKUP['smb.qfsi_loi'] = 'int_le' # has value: 0201
+ TYPELOOKUP['smb.dialect.index'] = 'int_le' # has value: 0500
TYPELOOKUP['smb.sm'] = 'id' # has value: 0f
- TYPELOOKUP['smb.max_vcs'] = 'int' # has value: 0100
- TYPELOOKUP['smb.max_bufsize'] = 'int' # has value: 04110000
- TYPELOOKUP['smb.max_raw'] = 'int' # has value: 00000100
- TYPELOOKUP['smb.system.time'] = 'timestamp' # has value: eec89f561287cc01
+ TYPELOOKUP['smb.max_vcs'] = 'int_le' # has value: 0100
+ TYPELOOKUP['smb.max_bufsize'] = 'int_le' # has value: 04110000
+ TYPELOOKUP['smb.max_raw'] = 'int_le' # has value: 00000100
+ TYPELOOKUP['smb.system.time'] = 'timestamp_le' # has value: eec89f561287cc01
TYPELOOKUP['smb.server_timezone'] = 'id' # has value: 88ff
TYPELOOKUP['smb.challenge_length'] = 'int' # has value: 00
TYPELOOKUP['smb.server_guid'] = 'id' # has value: 535ab176fc509c4697f4f3969e6c3d8d
TYPELOOKUP['smb.dialect'] = 'chars' # has value: 024e54204c4d20302e313200
TYPELOOKUP['smb.search.attribute'] = 'flags' # has value: 1600
- TYPELOOKUP['smb.search_count'] = 'int' # has value: 5605
+ TYPELOOKUP['smb.search_count'] = 'int_le' # has value: 5605
TYPELOOKUP['smb.find_first2.flags'] = 'flags' # has value: 0600
TYPELOOKUP['smb.ff2_loi'] = 'int' # has value: 0401
TYPELOOKUP['smb.storage_type'] = 'int' # has value: 00000000
TYPELOOKUP['smb.search_pattern'] = 'chars' # has value: 5c002a000000
- TYPELOOKUP['smb.index_number'] = 'int' # has value: 64bf000000000500
+ TYPELOOKUP['smb.index_number'] = 'int_le' # has value: 64bf000000000500
TYPELOOKUP['smb.dcm'] = 'flags' # has value: 0000
- TYPELOOKUP['smb.data_len_low'] = 'int' # has value: 6b00
- TYPELOOKUP['smb.data_len_high'] = 'int' # has value: 00000000
- TYPELOOKUP['smb.file_data'] = 'chars' # has value: b'[.ShellClassInfo]\r\nInfoTip...
- TYPELOOKUP['smb.count_low'] = 'int' # has value: 4800
- TYPELOOKUP['smb.count_high'] = 'int' # has value: 0000
+ TYPELOOKUP['smb.data_len_low'] = 'int_le' # has value: 6b00
+ TYPELOOKUP['smb.data_len_high'] = 'int_le' # has value: 00000000
+ TYPELOOKUP['smb.file_data'] = 'bytes' # sometimes chars, has value: b'[.ShellClassInfo]\r\nInfoTip...
+ TYPELOOKUP['smb.count_low'] = 'int_le' # has value: 4800
+ TYPELOOKUP['smb.count_high'] = 'int_le' # has value: 0000
TYPELOOKUP['smb.error_class'] = 'int' # has value: 00
- TYPELOOKUP['smb.error_code'] = 'int' # has value: 0000
+ TYPELOOKUP['smb.error_code'] = 'int_le' # has value: 0000
TYPELOOKUP['smb.fs_attr'] = 'int' # has value: ff002700
- TYPELOOKUP['smb.fs_max_name_len'] = 'int' # has value: ff000000
- TYPELOOKUP['smb.fs_name.len'] = 'int' # has value: 08000000
+ TYPELOOKUP['smb.fs_max_name_len'] = 'int_le' # has value: ff000000
+ TYPELOOKUP['smb.fs_name.len'] = 'int_le' # has value: 08000000
TYPELOOKUP['smb.fs_name'] = 'chars' # has value: 4e00540046005300
TYPELOOKUP['smb.extra_byte_parameters'] = 'chars' # has value: b'W\x00i\x00n\x00d\x00o\x00w\x00s\x00 \x00N\x00T...
- TYPELOOKUP['smb.ansi_pwlen'] = 'int' # has value: 0100
- TYPELOOKUP['smb.unicode_pwlen'] = 'int' # has value: 0000
+ TYPELOOKUP['smb.ansi_pwlen'] = 'int_le' # has value: 0100
+ TYPELOOKUP['smb.unicode_pwlen'] = 'int_le' # has value: 0000
TYPELOOKUP['smb.ansi_password'] = 'bytes' # has value: 00
TYPELOOKUP['smb.account'] = 'chars' # has value: 0000
- TYPELOOKUP['smb.nt.function'] = 'int' # has value: 0400
+ TYPELOOKUP['smb.nt.function'] = 'int_le' # has value: 0400
TYPELOOKUP['smb.nt.notify.completion_filter'] = 'flags' # has value: 17000000
TYPELOOKUP['smb.nt.notify.watch_tree'] = 'int' # has value: 00
- TYPELOOKUP['smb.challenge'] = 'bytes' # has value: 1340e2b3305971f8
+ TYPELOOKUP['smb.challenge'] = 'crypto' # (bytes) has value: 1340e2b3305971f8
TYPELOOKUP['smb.server'] = 'chars' # has value: 440043002d0042004c00550045000000
TYPELOOKUP['pad'] = 'pad' # has value: 000000
TYPELOOKUP['smb2.ioctl.function'] = 'enum' # has value: a8000900
TYPELOOKUP['smb.nt.ioctl.isfsctl'] = 'enum' # has value: 01
TYPELOOKUP['smb.nt.ioctl.completion_filter'] = 'flags' # has value: 00
- TYPELOOKUP['smb.echo.count'] = 'int' # has value: 0100
+ TYPELOOKUP['smb.echo.count'] = 'int_le' # has value: 0100
TYPELOOKUP['smb.echo.data'] = 'bytes' # has value: 4a6c4a6d4968436c42737200
- TYPELOOKUP['smb.echo.seq_num'] = 'int' # has value: 0100
+ TYPELOOKUP['smb.echo.seq_num'] = 'int_le' # has value: 0100
TYPELOOKUP['smb.lock.type'] = 'flags' # has value: 12
- TYPELOOKUP['smb.lock.length'] = 'int'
- TYPELOOKUP['smb.lock.offset'] = 'int'
+ TYPELOOKUP['smb.lock.length'] = 'int_le'
+ TYPELOOKUP['smb.lock.offset'] = 'int_le'
TYPELOOKUP['smb.locking.oplock.level'] = 'int' # has value: 01
- TYPELOOKUP['smb.locking.num_unlocks'] = 'int' # has value: 0000
- TYPELOOKUP['smb.locking.num_locks'] = 'int' # has value: 0000
+ TYPELOOKUP['smb.locking.num_unlocks'] = 'int_le' # has value: 0000
+ TYPELOOKUP['smb.locking.num_locks'] = 'int_le' # has value: 0000
TYPELOOKUP['smb.nt_transaction_setup'] = 'bytes' # has value: 0200644014000580
- TYPELOOKUP['smb2.ioctl.shadow_copy.num_volumes'] = 'int' # has value: 00000000
- TYPELOOKUP['smb2.ioctl.shadow_copy.num_labels'] = 'int' # has value: 00000000
- TYPELOOKUP['smb2.ioctl.shadow_copy.count'] = 'int' # has value: 02000000
+ TYPELOOKUP['smb2.ioctl.shadow_copy.num_volumes'] = 'int_le' # has value: 00000000
+ TYPELOOKUP['smb2.ioctl.shadow_copy.num_labels'] = 'int_le' # has value: 00000000
+ TYPELOOKUP['smb2.ioctl.shadow_copy.count'] = 'int_le' # has value: 02000000
TYPELOOKUP['smb.unicode_password'] = 'bytes'
TYPELOOKUP['smb.trans_data'] = 'bytes'
TYPELOOKUP['smb2.unknown'] = 'bytes' # has value: 0716
-# noinspection PyDictCreation
+
+ # TODO enable reuse by providing the original field name to each hook
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookAppendColon(value, siblings: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+ """
+ Hook to return a colon as delimiter. See :func:`walkSubTree()`.
+
+ :param value: value of the field we are working on (str or list)
+ :param siblings: subfields that we know of by now
+ :return: list of tuples to add as new fields
+ """
+ return [('delimiter', '3a'),]
+
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookAppendSpace(value, siblings) -> List[Tuple[str, str]]:
+ """
+ Hook to return a space as delimiter. See :func:`walkSubTree()`.
+
+ :param value: value of the field we are working on (str or list)
+ :param siblings: subfields that we know of by now
+ :type siblings: list[tuple[str, str]]
+ :return: list of tuples to add as new fields
+ """
+ return [('delimiter', '20'),]
+
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookAppendColonSpace(value, siblings) -> List[Tuple[str, str]]:
+ """
+ Hook to return a colon and a space as 2-char delimiter. See :func:`walkSubTree()`.
+
+ :param value: value of the field we are working on (str or list)
+ :param siblings: subfields that we know of by now
+ :type siblings: list[tuple[str, str]]
+ :return: list of tuples to add as new fields
+ """
+ return [('delimiter', '203a'),]
+
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookIRCemptyTrailer(value: str, siblings) -> Union[List[Tuple[str, str]], None]:
+ """
+ The silly IRC-dissector outputs no "_raw" value if a field is empty.
+ So we need to add the delimiter at least.
+
+ :param value: value of the leaf node we are working on
+ :param siblings: subfields that we know of by now
+ :type siblings: list[tuple[str, str]]
+ :return: list of tuples to add as new fields, or None for non-empty fields
+ """
+ if len(value) == 0:
+ return [('delimiter', '203a'),]
+
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookAppendCRLF(value, siblings) -> List[Tuple[str, str]]:
+ """
+ Hook to return a carriage-return/line-feed (CRLF) delimiter. See :func:`walkSubTree()`.
+
+ :param value: value of the field we are working on (str or list)
+ :param siblings: subfields that we know of by now
+ :type siblings: list[tuple[str, str]]
+ :return: list of tuples to add as new fields
+ """
+ return [('delimiter', '0d0a'),]
+
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookAppendNetServerEnum2(value, siblings) -> None:
+ """
+ Hook to fail on LANMAN's Function Code: NetServerEnum2 (104).
+
+ See :func:`walkSubTree()`.
+
+ :param value: value of the field we are working on (str or list)
+ :param siblings: subfields that we know of by now
+ :type siblings: list[tuple[str, str]]
+ :return: None; raises NotImplementedError if the value denotes NetServerEnum2
+ """
+ if value == '104': # Function Code: NetServerEnum2 (104)
+ raise NotImplementedError("LANMAN protocol's NetServerEnum2 not supported due to unparsed field at the end "
+ "of each Server entry in the tshark dissector.")
+ return None
+
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookAppendThreeZeros(value, siblings) -> List[Tuple[str, str]]:
+ """
+ Hook to return three zero bytes. See :func:`walkSubTree()`.
+
+ :param value: value of the field we are working on (str or list)
+ :param siblings: subfields that we know of by now
+ :type siblings: list[tuple[str, str]]
+ :return: list of tuples to add as new fields
+ """
+ return [('delimiter', '000000'),]
+
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookRaiseNotImpl(value, siblings) -> List[Tuple[str, str]]:
+ """
+ Hook to fail in case a dissector lacks required field information.
+
+ See :func:`walkSubTree()`.
+
+ :param value: value of the field we are working on (str or list)
+ :param siblings: subfields that we know of by now
+ :type siblings: list[tuple[str, str]]
+ :return: never returns; always raises NotImplementedError
+ """
+ raise NotImplementedError("Not supported due to unparsed field in the tshark dissector.")
+
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookAppendFourZeros(value, siblings) -> List[Tuple[str, str]]:
+ """
+ Hook to return four zero bytes. See :func:`walkSubTree()`.
+
+ :param value: value of the field we are working on (str or list)
+ :param siblings: subfields that we know of by now
+ :type siblings: list[tuple[str, str]]
+ :return: list of tuples to add as new fields
+ """
+ return [('delimiter', '00000000'),]
+
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookAppendUnknownTransParams(value, siblings) -> List[Tuple[str, str]]:
+ """
+ Hook to return the value of "Unknown Transaction2 Parameters". See :func:`walkSubTree()`.
+
+ :param value: value of the field we are working on (str or list)
+ :param siblings: subfields that we know of by now
+ :type siblings: list[tuple[str, str]]
+ :return: list of tuples to add as new fields
+ """
+ return [('unknownTrans2params', value),]
+
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookAppendUnknownTransData(value, siblings) -> List[Tuple[str, str]]:
+ """
+ Hook to return the value of "Unknown Transaction2 Data". See :func:`walkSubTree()`.
+
+ :param value: value of the field we are working on (str or list)
+ :param siblings: subfields that we know of by now
+ :type siblings: list[tuple[str, str]]
+ :return: list of tuples to add as new fields
+ """
+ return [('unknownTrans2data', value),]
+
+
+
+ @staticmethod
+ def _hookAppendUnknownTransReqBytes(value, siblings) -> Union[List[Tuple[str, str]], None]:
+ """
+ Hook to re-add transaction request bytes missing from the dissection after 'smb.reserved'. See :func:`walkSubTree()`.
+
+ :param value: value of the field we are working on (str or list)
+ :param siblings: subfields that we know of by now
+ :type siblings: list[tuple[str, str]]
+ :return: list of tuples to add as new fields, or None if the condition does not match
+ """
+ if value == '00' and siblings[-1] == ('smb.sc', '03'):
+ return [('unknownTransReqBytes', '010001000200'),]
+
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookGssapi(value, siblings) -> List[Tuple[str, str]]:
+ """
+ Hook to return the first four bytes of the GSS-API field as a pseudo-field. See :func:`walkSubTree()`.
+
+ :param value: value of the field we are working on (str or list)
+ :param siblings: subfields that we know of by now
+ :type siblings: list[tuple[str, str]]
+ :return: list of tuples to add as new fields
+ """
+ return [('gss-api', value[:8]),]
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookFirstByte(value: list, siblings: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+ """
+ Hook to return the first byte of the given value for bootp.option.type
+
+ :param value: hex value of the field we are working on
+ :param siblings: subfields that we know of by now
+ :return: list with one tuple of field name and value to add as new field
+ """
+ return [('bootp.option.type', value[:2]),]
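+ # e.g. for a hypothetical raw option value '3501', this yields
+ # [('bootp.option.type', '35')]; the option's remaining bytes are then
+ # covered by walking the option's subtree (bootp.option.type_tree)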
+
+ # HOOKS register. See :func:`walkSubTree()`.
+ # noinspection PyUnresolvedReferences
+ prehooks = {
+ 'bootp.option.type_raw': _hookFirstByte.__func__,
+
+ 'irc.response.prefix_raw': _hookAppendColon.__func__,
+ 'irc.response.trailer_raw': _hookAppendColonSpace.__func__,
+ 'irc.response.trailer': _hookIRCemptyTrailer.__func__,
+ 'irc.request.prefix_raw': _hookAppendColon.__func__,
+ 'irc.request.trailer_raw': _hookAppendColonSpace.__func__,
+ 'irc.request.trailer': _hookIRCemptyTrailer.__func__,
+
+ 'gss-api_raw' : _hookGssapi.__func__,
+ 'ntlmssp.version.ntlm_current_revision_raw' : _hookAppendThreeZeros.__func__,
+ }
+ ## Basic handling of missing single delimiter characters is generalized by comparing the original message to the
+ ## concatenated dissector result. See :func:`_reassemblePostProcessing()`.
+ # noinspection PyUnresolvedReferences
+ posthooks = {
+ 'lanman.function_code' : _hookAppendNetServerEnum2.__func__,
+ 'smb.dfs.referral.version' : _hookRaiseNotImpl.__func__,
+ 'dcerpc.cn_num_ctx_items' : _hookAppendThreeZeros.__func__,
+ 'Unknown Transaction2 Parameters' : _hookAppendUnknownTransParams.__func__,
+ 'Unknown Transaction2 Data' : _hookAppendUnknownTransData.__func__,
+ 'smb.reserved': _hookAppendUnknownTransReqBytes.__func__,
+ 'nbns.session_data_packet_size' : _hookAppendFourZeros.__func__,
+ }
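+ # Illustrative sketch of how walkSubTree() consults these registries: a
+ # prehook runs before the leaf value is appended, a posthook afterwards;
+ # either may return a list of (name, hexvalue) tuples to splice in:
+ # if fieldkey in CONSTANTS_CLASS.prehooks:
+ # extra = CONSTANTS_CLASS.prehooks[fieldkey](nodevalue, subfields)
+ # if extra is not None:
+ # subfields.extend(extra) # e.g. [('delimiter', '3a')]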
+
+# noinspection PyDictCreation,PyAbstractClass
class ParsingConstants263(ParsingConstants226):
"""
Compatibility for tshark 2.6.3 to 2.6.5
@@ -429,139 +1027,119 @@ class ParsingConstants263(ParsingConstants226):
"""
COMPATIBLE_TO = b'2.6.5'
- pass
-
-class MessageTypeIdentifiers(object):
- # fields or combinations of field that identify a message type for a specific protocol
- FOR_PROTCOL = {
- 'bootp' : ['bootp.option.dhcp'],
- 'dns' : ['dns.flags', 'dns.qry.type'],
- 'nbns' : ['nbns.flags'],
- 'nbss' : ['nbss.type', {
- 'field': 'smb.cmd',
- 'filter': lambda v: v != 'ff',
- 'select': lambda w: w
- }, {
- 'field': 'smb.flags',
- 'filter': lambda v: True,
- 'select': lambda w: (int.from_bytes(bytes.fromhex(w), "big") & 128) != 0 # first bit denotes request/response
- }],
- 'ntp' : ['ntp.flags', 'ntp.stratum']
- }
+# noinspection PyAbstractClass
+class ParsingConstants325(ParsingConstants263):
+ """
+ Compatibility for tshark 3.2.5
- NAMED_TYPES = { # assumes hex bytes are lower-case
- 'bootp.option.dhcp' : {
- '01': 'Discover',
- '02': 'Offer',
- '03': 'Request',
- '04': 'Decline',
- '05': 'ACK',
- '07': 'Release',
- '08': 'Inform',
- },
- 'nbss.type' : {
- '00': 'SMB'
- },
- 'dns.flags' : {
- '0100': 'Standard query',
- '8182': 'Response (failure)',
- '8183': 'Response (no such name)',
- '8580': 'Response (success)',
- },
- 'dns.qry.type': {
- '0001': 'A',
- '0002': 'NS',
- '0010': 'TXT',
- '001c': 'AAAA',
- '000f': 'MX',
- '000c': 'PTR',
- '0006': 'SOA',
- '0021': 'SRV',
- },
- 'smb.cmd': {
- '04': 'Close (0x04)',
- '24': 'Locking AndX Request (0x24)',
- '2b': 'Echo Request (0x2b)',
- '2e': 'Read AndX (0x2e)',
- '2f': 'Write AndX Response (0x2f)',
- 'a0': 'NT Trans (0xa0)',
- 'a2': 'NT Create AndX (0xa2)',
- 'a4': 'NT Cancel (0xa4)',
- '71': 'Tree Disconnect (0x71)',
- '72': 'Negotiate Protocol (0x72)',
- '73': 'Session Setup AndX (0x73)',
- '74': 'Logoff AndX (0x74)',
- '75': 'Tree Connect AndX (0x75)',
- },
- 'smb.flags': { # first bit == 0 denotes request
- True: 'response',
- False: 'request',
- },
- 'nbns.flags': {
- '0110': 'Name query',
- '2810': 'Registration',
- '2910': 'Registration (recursion)',
- '3010': 'Release',
- '8500': 'Response',
- },
- 'ntp.flags': {
- '13': 'v2 client',
- '19': 'v3 symmetric active',
- '1b': 'v3 client',
- '1c': 'v3 server',
- '23': 'v4 client',
- '24': 'v4 server',
- '25': 'v4 broadcast',
- 'd9': 'v3 symmetric active (unsynchronized, MAC)',
- 'db': 'v3 client (unsynchronized)',
- 'dc': 'v3 server (unsynchronized)',
- 'e3': 'v4 client (unsynchronized, MAC)',
- 'e4': 'v4 server (unsynchronized)',
- 'e5': 'v4 broadcast (unsynchronized)',
- },
- # 'ntp.stratum': {
- # '00': '',
- # '03': '',
- # '04': '',
- # '05': '',
- # '06': '',
- # }
- }
+ TODO Determine starting from which exact tshark version this JSON output format is used.
+ """
+ COMPATIBLE_TO = b'3.2.5'
+ MESSAGE_TYPE_IDS = MessageTypeIdentifiers325
- @staticmethod
- def __resolveTypeName(fieldname: str, fieldvalue: str):
- return MessageTypeIdentifiers.NAMED_TYPES[fieldname][fieldvalue] \
- if fieldname in MessageTypeIdentifiers.NAMED_TYPES \
- and fieldvalue in MessageTypeIdentifiers.NAMED_TYPES[fieldname] \
- else "{}={}".format(fieldname, fieldvalue)
+ IGNORE_FIELDS = [
+ 'dhcp.option.type_raw', 'dhcp.option.value_raw', 'dhcp.option.end_raw',
+
+ # 'wlan.fc_raw', 'wlan.duration_raw', 'wlan.ra_raw', 'wlan.ta_raw', 'wlan.bssid_raw', 'wlan.frag_raw', 'wlan.seq_raw',
+ #
+ # 'wlan.fc.type_subtype', 'wlan.fc', 'wlan.fc_tree', 'wlan.duration', 'wlan.ra', 'wlan.ra_resolved',
+ # 'wlan.addr', 'wlan.addr_resolved', 'wlan.da', 'wlan.da_resolved', 'wlan.ta', 'wlan.ta_resolved',
+ # 'wlan.sa', 'wlan.sa_resolved', 'wlan.bssid', 'wlan.bssid_resolved', 'wlan.addr', 'wlan.addr_resolved',
+ # 'wlan.fcs.status', 'wlan.fcs', 'wlan.frag', 'wlan.seq',
+ 'wlan.fc.type_subtype_raw',
+ 'wlan.ra_resolved_raw',
+ 'wlan.addr_raw', 'wlan.addr_resolved_raw', 'wlan.da_raw',
+ 'wlan.da_resolved_raw', 'wlan.ta_resolved_raw',
+ 'wlan.sa_raw', 'wlan.sa_resolved_raw',
+ 'wlan.bssid_resolved_raw', 'wlan.addr_raw',
+ 'wlan.addr_resolved_raw',
+ ]
+
+ # a convenience list for debugging: names of fields that need not give a warning if ignored.
+ EXCLUDE_SUB_FIELDS = [
+ 'dhcp.flags_tree', 'dhcp.fqdn.flags_tree', 'dhcp.secs_tree',
+
+ 'wlan.fc_tree', 'wlan.vht.capabilities_tree',
+ ]
+
+ INCLUDE_SUBFIELDS = [
+ 'dhcp.option.type_tree',
+ ]
+
+ RECORD_STRUCTURE = []
+
+ TYPELOOKUP = dict()
+ # dhcp
+ TYPELOOKUP['dhcp.type'] = 'flags' # or enum
+ TYPELOOKUP['dhcp.hw.type'] = 'flags' # or enum
+ TYPELOOKUP['dhcp.hw.len'] = 'int'
+ TYPELOOKUP['dhcp.hops'] = 'int'
+ TYPELOOKUP['dhcp.id'] = 'id'
+ TYPELOOKUP['dhcp.secs'] = 'int_le'
+ TYPELOOKUP['dhcp.flags'] = 'flags'
+ TYPELOOKUP['dhcp.ip.client'] = 'ipv4'
+ TYPELOOKUP['dhcp.ip.your'] = 'ipv4'
+ TYPELOOKUP['dhcp.ip.server'] = 'ipv4'
+ TYPELOOKUP['dhcp.ip.relay'] = 'ipv4'
+ TYPELOOKUP['dhcp.hw.mac_addr'] = 'macaddr'
+ TYPELOOKUP['dhcp.hw.addr_padding'] = 'bytes'
+ TYPELOOKUP['dhcp.server'] = 'chars'
+ TYPELOOKUP['dhcp.file'] = 'chars'
+ TYPELOOKUP['dhcp.cookie'] = 'id' # changed from 'bytes'
+ TYPELOOKUP['dhcp.option.padding'] = 'pad'
+ TYPELOOKUP['dhcp.option.type'] = 'enum' # special prehook since the dissector returns the whole option!
+ # dhcp.option.type_tree is walked from there!
+ TYPELOOKUP['dhcp.option.length'] = 'int' # has value: 01
+ TYPELOOKUP['dhcp.option.dhcp'] = 'enum' # has value: 03
+ TYPELOOKUP['dhcp.option.hostname'] = 'chars' # has value: 4f66666963653131
+ TYPELOOKUP['dhcp.fqdn.flags'] = 'flags' # uint; has value: 00
+ TYPELOOKUP['dhcp.fqdn.rcode1'] = 'enum' # uint; has value: 00
+ TYPELOOKUP['dhcp.fqdn.rcode2'] = 'enum' # uint; has value: 00
+ TYPELOOKUP['dhcp.fqdn.name'] = 'chars' # has value: 4f666669636531312e626c7565322e6578
+ TYPELOOKUP['dhcp.option.vendor_class_id'] = 'chars' # has value: 4d53465420352e30
+ TYPELOOKUP['dhcp.option.vendor.value'] = 'bytes' # has value: 5e00
+ TYPELOOKUP['dhcp.option.request_list_item'] = 'enum' # uint; has value: 01
+ TYPELOOKUP['dhcp.option.broadcast_address'] = 'ipv4' # has value: ac1203ff
+ TYPELOOKUP['dhcp.option.dhcp_server_id'] = 'ipv4' # has value: ac120301
+ TYPELOOKUP['dhcp.option.ip_address_lease_time'] = 'int' # uint; has value: 00000e10
+ TYPELOOKUP['dhcp.option.renewal_time_value'] = 'int' # uint; has value: 00000696
+ TYPELOOKUP['dhcp.option.rebinding_time_value'] = 'int' # uint; has value: 00000bdc
+ TYPELOOKUP['dhcp.option.subnet_mask'] = 'ipv4' # has value: ffffff00
+ TYPELOOKUP['dhcp.option.broadcast_address'] = 'ipv4' # has value: ac1203ff
+ TYPELOOKUP['dhcp.option.router'] = 'ipv4' # has value: ac120301
+ TYPELOOKUP['dhcp.option.domain_name_server'] = 'ipv4' # has value: ac120301
+ TYPELOOKUP['dhcp.option.domain_name'] = 'chars' # has value: 626c7565332e6578
+ TYPELOOKUP['dhcp.option.requested_ip_address'] = 'ipv4' # has value: 0a6e30d8
+ TYPELOOKUP['dhcp.option.dhcp_max_message_size'] = 'int' # uint; has value: 04ec
+ TYPELOOKUP['dhcp.client_id.uuid'] = 'id' # has value: 00000000000000000000000000000000
+ TYPELOOKUP['dhcp.option.ntp_server'] = 'ipv4' # has value: c0a800c8
+ # these may be behaving like flags
+ TYPELOOKUP['dhcp.option.client_system_architecture'] = 'enum' # has value: 0000
+ TYPELOOKUP['dhcp.client_network_id_major'] = 'int' # version number; has value: 02
+ TYPELOOKUP['dhcp.client_network_id_minor'] = 'int' # version number; has value: 01
+ TYPELOOKUP['dhcp.option.dhcp_auto_configuration'] = 'enum' # has value: 01
+ TYPELOOKUP['dhcp.option.message'] = 'chars'
+
+ # wlan.mgt
+ TYPELOOKUP['wlan.fixed.category_code'] = 'enum' # has value: 7f
+ TYPELOOKUP['wlan.tag.oui'] = 'addr' # has value: 0017f2
+ # noinspection PyUnusedLocal
@staticmethod
- def typeOfMessage(message: 'ParsedMessage'):
- if message.protocolname in MessageTypeIdentifiers.FOR_PROTCOL:
- idFields = MessageTypeIdentifiers.FOR_PROTCOL[message.protocolname]
- resolvedTypeName = []
- for ifield in idFields:
- if isinstance(ifield, dict): # complex type identifiers with filter and selector
- ifv = message.getValuesByName(ifield['field'])
- if not ifv:
- continue # to next field
- for idvalue in ifv:
- if ifield['filter'](idvalue):
- selectedid = ifield['select'](idvalue)
- resolvedTypeName.append(
- MessageTypeIdentifiers.__resolveTypeName(ifield['field'], selectedid))
- else: # simple identifier
- selectedid = message.getValuesByName(ifield)
- for ifv in selectedid:
- # noinspection PyTypeChecker
- resolvedTypeName.append(MessageTypeIdentifiers.__resolveTypeName(ifield, ifv))
- if len(resolvedTypeName) > 0:
- return ":".join(resolvedTypeName)
+ def _hookFirstByte(value: list, siblings: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+ """
+ Hook to return the first byte of the given value for dhcp.option.type
- # message identifier not known (outer if-statement)
- # or filter never matched (if-statement inside dict handling branch)
- raise Exception("No message type identifier known for protocol {}".format(message.protocolname))
+ :param value: hex value of the field we are working on
+ :param siblings: subfields that we know of by now
+ :return: list with one tuple of field name and value to add as new field
+ """
+ return [('dhcp.option.type', value[:2]),]
+
+ # noinspection PyUnresolvedReferences
+ prehooks = {'dhcp.option.type_raw': _hookFirstByte.__func__}
+ # posthooks = {}
class DissectionInvalidError(Exception):
@@ -704,8 +1282,9 @@ def _parseMultiple(messages: List[RawMessage], target = None, layer=-1, relative
"""
Bulk create ParsedMessages in one tshark run for better performance.
+ >>> # noinspection PyUnresolvedReferences
>>> from netzob.all import *
- >>> from validation.messageParser import ParsedMessage
+ >>> from nemere.validation.messageParser import ParsedMessage
>>> # pkt = PCAPImporter.readFile("../input/irc_ictf2010-42_deduped-100.pcap", importLayer=1).values()
>>> pkt = PCAPImporter.readFile("../input/dns_ictf2010_deduped-100.pcap", importLayer=1).values()
>>> pms = ParsedMessage.parseMultiple(pkt)
@@ -762,7 +1341,6 @@ def _parseMultiple(messages: List[RawMessage], target = None, layer=-1, relative
prsdmsgs.update(ParsedMessage._parseMultiple(msgChunk, target, layer, relativeToIP,
failOnUndissectable, linktype))
print("Stopped for raised exception:", e)
- # IPython.embed()
# Parse JSON:
try:
@@ -848,17 +1426,35 @@ def _parseJSON(self, dissectjson: List[Tuple[str, any]]):
if self.relativeToIP and 'ip' not in self.protocols:
errortext = "No IP layer could be identified in a message of the trace."
raise DissectionTemporaryFailure(errortext)
- absLayNum = (self.layernumber if self.layernumber >= 0 else len(self.protocols) - 1) \
- if not self.relativeToIP else (self.protocols.index('ip') + self.layernumber)
+ if not self.relativeToIP:
+ baselayer = 0 if 'radiotap' not in self.protocols else self.protocols.index('radiotap') + 1
+ absLayNum = (baselayer + self.layernumber) if self.layernumber >= 0 else len(self.protocols) - 1
+ else:
+ absLayNum = self.protocols.index('ip') + self.layernumber
try:
# protocolname is e.g. 'ntp'
self.protocolname = self.protocols[absLayNum]
except IndexError as e:
- ptxt = " ".join([absLayNum, 'missing in', self.protocols])
- print(ptxt)
- subprocess.run(["spd-say", ptxt])
- IPython.embed()
- raise e
+ # there is a bug in the wlan.mgt/awdl dissector: "sometimes" it doesn't list any layer
+ # above wlan in the protocols value of frame (see `self.protocols`), so we check for
+ # existing layers in the keys of `layersvalue` that have no "_raw" suffix (see
+ # `ParsedMessage.RK`)
+ rawProtocols = [lv[0] for lv in layersvalue
+ if not lv[0].endswith(ParsedMessage.RK) and lv[0] != framekey]
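+ # hypothetical example: if frame.protocols only listed ['radiotap', 'wlan']
+ # but the layer keys are ['radiotap', 'wlan', 'wlan.mgt'] plus their '_raw'
+ # twins, rawProtocols recovers ['radiotap', 'wlan', 'wlan.mgt']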
+ # make sure everything up to the last protocol listed in self.protocols is also contained
+ # in rawProtocols; then replace the self.protocols list with the manually determined one.
+ if rawProtocols[:len(self.protocols)] == self.protocols:
+ self.protocols = rawProtocols
+ self.protocolname = self.protocols[absLayNum]
+ else:
+ ptxt = f"{absLayNum} missing in {self.protocols}"
+ print(ptxt)
+ try:
+ subprocess.run(["spd-say", ptxt])
+ except FileNotFoundError:
+ pass # does not matter, there simply is no speech notification
+ IPython.embed()
+ raise e
self._dissectfull = ParsedMessage._getElementByName(layersvalue, self.protocolname)
# add missing layers in protocols list
@@ -874,6 +1470,12 @@ def _parseJSON(self, dissectjson: List[Tuple[str, any]]):
self.protocols += missingLayers
break
+ # Only 'frame_raw' is guaranteed to contain all the bytes. Should we therefore use that value?
+ self.protocolbytes = ParsedMessage._getElementByName(layersvalue,
+ self.protocolname + ParsedMessage.RK) # tshark 2.2.6
+ if isinstance(self.protocolbytes, list): # tshark 2.6.3
+ self.protocolbytes = self.protocolbytes[0]
+
# what to do with layers after (embedded in) the target protocol
if absLayNum < len(self.protocols):
for embedded in self.protocols[absLayNum+1 : ]:
@@ -887,6 +1489,10 @@ def _parseJSON(self, dissectjson: List[Tuple[str, any]]):
if not isinstance(self._dissectfull, list):
print ("Undifferentiated protocol content for protocol ", self.protocolname,
"\nDissector JSON is: ", self._dissectfull)
+ try:
+ subprocess.run(["spd-say", "'Undifferenzierter Protokolinhalt!'"])
+ except FileNotFoundError:
+ pass # does not matter, there simply is no speech notification
IPython.embed() # TODO how to handle this in general without the need for interaction?
raise DissectionInsufficient("Undifferentiated protocol content for protocol ", self.protocolname,
"\nDissector JSON is: ", self._dissectfull)
@@ -902,16 +1508,12 @@ def _parseJSON(self, dissectjson: List[Tuple[str, any]]):
raise DissectionInsufficient(
"Incomplete dissection. Probably wrong base encapsulation detected?")
- self.protocolbytes = ParsedMessage._getElementByName(layersvalue,
- self.protocolname + ParsedMessage.RK) # tshark 2.2.6
- if isinstance(self.protocolbytes, list): # tshark 2.6.3
- self.protocolbytes = self.protocolbytes[0]
# field keys, filter for those ending with '_raw' into fieldnames
- self._fieldsflat = ParsedMessage._walkSubTree(self._dissectfull)
+ self._fieldsflat = ParsedMessage.walkSubTree(self._dissectfull)
try:
ParsedMessage._reassemblePostProcessing(self)
except DissectionIncomplete as e:
- print("Known message dissection is", ", ".join(ds[0] for ds in dissectsub))
+ print("Known message dissection is", ", ".join(ds[0] for ds in self._dissectfull))
print("Too long unknown message trail found. Rest is:", e.rest,
"\nfor Wireshark filter: {}".format(':'.join([b1 + b2 for b1, b2 in
zip(self.protocolbytes[::2],
@@ -922,10 +1524,34 @@ def _parseJSON(self, dissectjson: List[Tuple[str, any]]):
# validate dissection
if not "".join(self.getFieldValues()) == self.protocolbytes:
- print("Known message dissection is", ", ".join(ds[0] for ds in dissectsub))
- print('Dissection is incomplete:\nDissector result:',
- '{}\nOriginal packet: {}\nself is of type ParsedMessage'.format(
- "".join(self.getFieldValues()), self.protocolbytes))
+ from tabulate import tabulate
+ import difflib
+ import nemere.visualization.bcolors as bcolors
+ from textwrap import wrap
+ print("\n Known message dissection is") #, ", ".join(ds[0] for ds in dissectsub))
+ print(tabulate(zip(self.getFieldNames(), self.getFieldValues())))
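+ # diff the two hex strings bytewise: textwrap.wrap(..., 2) splits them
+ # into two-hex-digit (one byte) tokens for difflib.ndiff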
+ diffgen = difflib.ndiff(wrap("".join(self.getFieldValues()),2), wrap(self.protocolbytes,2))
+ fieldvaluesColored = ""
+ protobytesColored = ""
+ for diffchar in diffgen:
+ if diffchar[0] == "+":
+ protobytesColored += bcolors.colorizeStr(diffchar[2:], 10)
+ if diffchar[0] == "-":
+ fieldvaluesColored += bcolors.colorizeStr(diffchar[2:], 10)
+ if diffchar[0] == " ":
+ protobytesColored += diffchar[2:]
+ fieldvaluesColored += diffchar[2:]
+ print(
+ '\nDissection is incomplete. (Compare self.getFieldValues() and self.protocolbytes):'
+ '\nDissector result: {}\nOriginal packet: {}\n'.format(
+ fieldvaluesColored, protobytesColored)
+ )
+ print('self is of type ParsedMessage\n'
+ 'self._fieldsflat or self._dissectfull may be worth inspecting\n')
+ try:
+ subprocess.run(["spd-say", "'Dissection unvollständig!'"])
+ except FileNotFoundError:
+ pass # does not matter, there simply is no speech notification
IPython.embed()
raise DissectionIncomplete('Dissection is incomplete:\nDissector result: {}\n'
@@ -972,8 +1598,9 @@ def _reassemblePostProcessing(self):
# self.printUnknownTypes()
# pprint(self._dissectfull)
print()
- raise ValueError("Unparsed field found between field {} and {}. Value: {:s}".format(
- self.getFieldNames()[index - 1], self.getFieldNames()[index],
+ raise ValueError("Unparsed field found between field {} ({:s}) and {} ({:s}). Value: {:s}".format(
+ self.getFieldNames()[index - 1], self.getFieldValues()[index - 1],
+ self.getFieldNames()[index], self.getFieldValues()[index],
rest[:offset]) + "\nfor Wireshark filter: {}".format(':'.join([b1 + b2 for b1, b2 in
zip(self.protocolbytes[::2],
self.protocolbytes[1::2])]))
@@ -1027,7 +1654,7 @@ def _nodeValue(node) -> Tuple[int, Union[str, List]]:
@staticmethod
- def _walkSubTree(root: List[Tuple[str, any]], allSubFields=False) -> List[Tuple[str, str]]:
+ def walkSubTree(root: List[Tuple[str, any]], allSubFields=False) -> List[Tuple[str, str]]:
"""
Walk the tree structure of the tshark-json, starting from ``root`` and generate a flat representation
of the field sequence as it is in the message.
@@ -1046,23 +1673,32 @@ def _walkSubTree(root: List[Tuple[str, any]], allSubFields=False) -> List[Tuple[
nodetype, nodevalue = ParsedMessage._nodeValue(subnode)
# apply pre-hook if any for this field name
- if fieldkey in ParsedMessage._prehooks:
- ranPreHook = ParsedMessage._prehooks[fieldkey](nodevalue, subfields)
+ if fieldkey in CONSTANTS_CLASS.prehooks:
+ ranPreHook = CONSTANTS_CLASS.prehooks[fieldkey](nodevalue, subfields)
if ranPreHook is not None:
- subfields.append(ranPreHook)
+ subfields.extend(ranPreHook)
# append leaf data
if fieldkey.endswith(ParsedMessage.RK) and nodetype > 0:
if fieldkey not in CONSTANTS_CLASS.IGNORE_FIELDS:
- subfields.append((fieldkey[:-len(ParsedMessage.RK)],
- nodevalue))
+ # fix faulty dissector outputs
+ if len(nodevalue) % 2 != 0:
+ nodevalue = '0' + nodevalue
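+ # e.g. a half-byte value 'f' becomes '0f'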
+ subfields.extend([(fieldkey[:-len(ParsedMessage.RK)],
+ nodevalue),])
# branch node, ignore textual descriptions
elif nodetype == 0:
- if allSubFields or fieldkey in CONSTANTS_CLASS.INCLUDE_SUBFIELDS:
+ fkMatchesRe = any(sfre.match(fieldkey) is not None for sfre in CONSTANTS_CLASS.INCLUDE_SUBFIELDS_RE) \
+ if not allSubFields else False # the if part is only to prevent unnecessary matching if not required anyway
+ if allSubFields and fieldkey not in CONSTANTS_CLASS.IGNORE_FIELDS \
+ or fkMatchesRe or fieldkey in CONSTANTS_CLASS.INCLUDE_SUBFIELDS:
subfields.extend(
- ParsedMessage._walkSubTree(nodevalue, fieldkey in CONSTANTS_CLASS.RECORD_STRUCTURE))
- elif fieldkey not in CONSTANTS_CLASS.EXCLUDE_SUB_FIELDS: # to get a notice on errors
+ ParsedMessage.walkSubTree(nodevalue, fieldkey in CONSTANTS_CLASS.RECORD_STRUCTURE))
+ # to get a notice on errors, but not if
+ # a space is contained in the key (indicates a human-readable pseudo-field) or
+ # it is in EXCLUDE_SUB_FIELDS
+ elif ' ' not in fieldkey and fieldkey not in CONSTANTS_CLASS.EXCLUDE_SUB_FIELDS:
print("Ignored sub field:", fieldkey)
if fieldkey == '_ws.expert':
expertMessage = ParsedMessage._getElementByName(nodevalue, '_ws.expert.message')
@@ -1072,13 +1708,13 @@ def _walkSubTree(root: List[Tuple[str, any]], allSubFields=False) -> List[Tuple[
print('Malformed packet with unknown error.')
# apply post-hook, if any, for this field name
- if fieldkey in ParsedMessage._posthooks:
+ if fieldkey in CONSTANTS_CLASS.posthooks:
try:
- ranPostHook = ParsedMessage._posthooks[fieldkey](nodevalue, subfields)
+ ranPostHook = CONSTANTS_CLASS.posthooks[fieldkey](nodevalue, subfields)
except NotImplementedError as e:
raise NotImplementedError( "{} Field name: {}".format(e, fieldkey) )
if ranPostHook is not None:
- subfields.append(ranPostHook)
+ subfields.extend(ranPostHook)
return subfields
# for structures like irc:
@@ -1109,259 +1745,23 @@ def _walkSubTree(root: List[Tuple[str, any]], allSubFields=False) -> List[Tuple[
# "dns.qry.name_raw": "026130057477696d6703636f6d00",
# ...
-
-
- # TODO move all the hooks into the ParsingConstants class and their mapping
- # TODO enable reuse by providing the original field name to the hook
-
- # noinspection PyUnusedLocal
- @staticmethod
- def _hookAppendColon(value, siblings: List[Tuple[str, str]]) -> Tuple[str, str]:
- """
- Hook to return a colon as delimiter. See :func:`_walkSubTree()`.
-
- :param value: value of the field we are working on (str or list)
- :param siblings: subfields that we know of by now
- :return: tuple to add as new field
- """
- return 'delimiter', '3a'
-
-
- # noinspection PyUnusedLocal
- @staticmethod
- def _hookAppendSpace(value, siblings) -> Tuple[str, str]:
- """
- Hook to return a space as delimiter. See :func:`_walkSubTree()`.
-
- :param value: value of the field we are working on (str or list)
- :param siblings: subfields that we know of by now
- :type siblings: list[tuple[str, str]]
- :return: tuple to add as new field
- """
- return 'delimiter', '20'
-
-
- # noinspection PyUnusedLocal
- @staticmethod
- def _hookAppendColonSpace(value, siblings) -> Tuple[str, str]:
- """
- Hook to return a colon and a space as 2-char delimiter. See :func:`_walkSubTree()`.
-
- :param value: value of the field we are working on (str or list)
- :param siblings: subfields that we know of by now
- :type siblings: list[tuple[str, str]]
- :return: tuple to add as new field
- """
- return 'delimiter', '203a'
-
-
- # noinspection PyUnusedLocal
- @staticmethod
- def _hookIRCemptyTrailer(value: str, siblings) -> Tuple[str, str]:
- """
- The silly IRC-dissector outputs no "_raw" value if a field is empty.
- So we need to add the delimiter at least.
-
- :param value: value of the leaf node we are working on
- :param siblings: subfields that we know of by now
- :type siblings: list[tuple[str, str]]
- :return: tuple to add as new field
- """
- if len(value) == 0:
- return 'delimiter', '203a'
-
-
- # noinspection PyUnusedLocal
- @staticmethod
- def _hookAppendCRLF(value, siblings) -> Tuple[str, str]:
- """
- Hook to return a carriage returne and line feed delimiter. See :func:`_walkSubTree()`.
-
- :param value: value of the field we are working on (str or list)
- :param siblings: subfields that we know of by now
- :type siblings: list[tuple[str, str]]
- :return: tuple to add as new field
- """
- return 'delimiter', '0d0a'
-
-
- # noinspection PyUnusedLocal
- @staticmethod
- def _hookAppendNetServerEnum2(value, siblings) -> None:
- """
- Hook to fail on LANMAN's Function Code: NetServerEnum2 (104).
-
- See :func:`_walkSubTree()`.
-
- :param value: value of the field we are working on (str or list)
- :param siblings: subfields that we know of by now
- :type siblings: list[tuple[str, str]]
- :return: tuple to add as new field
- """
- if value == '104': # Function Code: NetServerEnum2 (104)
- raise NotImplementedError("LANMAN protocol's NetServerEnum2 not supported due to unparsed field at the end "
- "of each Server entry in the tshark dissector.")
- return None
-
-
- # noinspection PyUnusedLocal
- @staticmethod
- def _hookAppendThreeZeros(value, siblings) -> Tuple[str, str]:
- """
- Hook to return three zero bytes. See :func:`_walkSubTree()`.
-
- :param value: value of the field we are working on (str or list)
- :param siblings: subfields that we know of by now
- :type siblings: list[tuple[str, str]]
- :return: tuple to add as new field
- """
- return 'delimiter', '000000'
-
-
- # noinspection PyUnusedLocal
- @staticmethod
- def _hookRaiseNotImpl(value, siblings) -> Tuple[str, str]:
- """
- Hook to fail in case a dissector lacks required field information.
-
- See :func:`_walkSubTree()`.
-
- :param value: value of the field we are working on (str or list)
- :param siblings: subfields that we know of by now
- :type siblings: list[tuple[str, str]]
- :return: tuple to add as new field
- """
- raise NotImplementedError("Not supported due to unparsed field in the tshark dissector.")
-
-
- # noinspection PyUnusedLocal
- @staticmethod
- def _hookAppendFourZeros(value, siblings) -> Tuple[str, str]:
- """
- Hook to return three zero bytes. See :func:`_walkSubTree()`.
-
- :param value: value of the field we are working on (str or list)
- :param siblings: subfields that we know of by now
- :type siblings: list[tuple[str, str]]
- :return: tuple to add as new field
- """
- return 'delimiter', '00000000'
-
-
- # noinspection PyUnusedLocal
- @staticmethod
- def _hookAppendUnknownTransParams(value, siblings) -> Tuple[str, str]:
- """
- Hook to return the value of "Unknown Transaction2 Parameters". See :func:`_walkSubTree()`.
-
- :param value: value of the field we are working on (str or list)
- :param siblings: subfields that we know of by now
- :type siblings: list[tuple[str, str]]
- :return: tuple to add as new field
- """
- return 'unknownTrans2params', value
-
-
- # noinspection PyUnusedLocal
- @staticmethod
- def _hookAppendUnknownTransData(value, siblings) -> Tuple[str, str]:
- """
- Hook to return the value of "Unknown Transaction2 Parameters". See :func:`_walkSubTree()`.
-
- :param value: value of the field we are working on (str or list)
- :param siblings: subfields that we know of by now
- :type siblings: list[tuple[str, str]]
- :return: tuple to add as new field
- """
- return 'unknownTrans2data', value
-
-
-
- @staticmethod
- def _hookAppendUnknownTransReqBytes(value, siblings) -> Tuple[str, str]:
- """
- Hook to return the value of "Unknown Transaction2 Parameters". See :func:`_walkSubTree()`.
-
- :param value: value of the field we are working on (str or list)
- :param siblings: subfields that we know of by now
- :type siblings: list[tuple[str, str]]
- :return: tuple to add as new field
- """
- if value == '00' and siblings[-1] == ('smb.sc', '03'):
- return 'unknownTransReqBytes', '010001000200'
-
-
- # noinspection PyUnusedLocal
- @staticmethod
- def _hookGssapi(value, siblings) -> Tuple[str, str]:
- """
- Hook to return the value of "Unknown Transaction2 Parameters". See :func:`_walkSubTree()`.
-
- :param value: value of the field we are working on (str or list)
- :param siblings: subfields that we know of by now
- :type siblings: list[tuple[str, str]]
- :return: tuple to add as new field
- """
- return 'gss-api', value[:8]
-
-
- @staticmethod
- def _hookFirstByte(value: list, siblings: List[Tuple[str, str]]) -> Tuple[str, str]:
- """
- Hook to return the first byte of the given value for bootp.option.type
-
- :param value: hex value of the field we are working on
- :param siblings: subfields that we know of by now
- :return: tuple of field name and value to add as new field
- """
- return 'bootp.option.type', value[:2]
-
-
- # HOOKS register. See :func:`_walkSubTree()`.
- # noinspection PyUnresolvedReferences
- _prehooks = {
- 'bootp.option.type_raw': _hookFirstByte.__func__,
-
- 'irc.response.prefix_raw': _hookAppendColon.__func__,
- 'irc.response.trailer_raw': _hookAppendColonSpace.__func__,
- 'irc.response.trailer': _hookIRCemptyTrailer.__func__,
- 'irc.request.prefix_raw': _hookAppendColon.__func__,
- 'irc.request.trailer_raw': _hookAppendColonSpace.__func__,
- 'irc.request.trailer': _hookIRCemptyTrailer.__func__,
-
- 'gss-api_raw' : _hookGssapi.__func__,
- 'ntlmssp.version.ntlm_current_revision_raw' : _hookAppendThreeZeros.__func__,
- }
- ## Basic handling of missing single delimiter characters is generalized by comparing the original message to the
- ## concatenated dissector result. See :func:`_reassemblePostProcessing()
- ## within :func:`_reassemblePostProcessing()`
- # noinspection PyUnresolvedReferences
- _posthooks = {
- 'lanman.function_code' : _hookAppendNetServerEnum2.__func__,
- 'smb.dfs.referral.version' : _hookRaiseNotImpl.__func__,
- 'dcerpc.cn_num_ctx_items' : _hookAppendThreeZeros.__func__,
- 'Unknown Transaction2 Parameters' : _hookAppendUnknownTransParams.__func__,
- 'Unknown Transaction2 Data' : _hookAppendUnknownTransData.__func__,
- 'smb.reserved': _hookAppendUnknownTransReqBytes.__func__,
- 'nbns.session_data_packet_size' : _hookAppendFourZeros.__func__,
- }
-
-
@staticmethod
- def __getCompatibleConstants():
+ def __getCompatibleConstants() -> ParsingConstants:
"""
Retrieve the ParsingConstants compatible to specific versions of tshark.
- TODO Determine at which exact tshark version the JSON output format is changed.
+ TODO Determine at which exact tshark version the JSON output format is changed in each case.
- :return: Appropriate ParsingConstants class
+ :return: Appropriate ParsingConstants instance
"""
if ParsedMessage.__tshark.version <= ParsingConstants226.COMPATIBLE_TO:
- return ParsingConstants226
+ return ParsingConstants226()
elif ParsedMessage.__tshark.version <= ParsingConstants263.COMPATIBLE_TO:
- return ParsingConstants263
+ return ParsingConstants263()
+ elif ParsedMessage.__tshark.version <= ParsingConstants325.COMPATIBLE_TO:
+ return ParsingConstants325()
else:
- return ParsingConstants263
+ return ParsingConstants325()
### #############################################
@@ -1384,8 +1784,9 @@ def printUnknownTypes(self):
Prints to the console.
Example:
+ >>> # noinspection PyUnresolvedReferences
>>> from netzob.all import *
- >>> from validation.messageParser import ParsedMessage
+ >>> from nemere.validation.messageParser import ParsedMessage
>>> dhcp = PCAPImporter.readFile("../input/dhcp_SMIA2011101X_deduped-100.pcap", importLayer=1).values()
>>> pms = ParsedMessage.parseMultiple(dhcp)
>>> for parsed in pms.values(): parsed.printUnknownTypes()
@@ -1466,8 +1867,7 @@ def getValuesByName(self, fieldname):
@property
def messagetype(self):
- return MessageTypeIdentifiers.typeOfMessage(self)
-
+ return self.__getCompatibleConstants().MESSAGE_TYPE_IDS.typeOfMessage(self)
def __getstate__(self):
"""
@@ -1479,7 +1879,6 @@ def __getstate__(self):
statecopy["_ParsedMessage_CLASS___tshark"] = ParsedMessage.__tshark
return statecopy
-
def __setstate__(self, state):
"""
        Include required class attribute in pickling.
diff --git a/src/nemere/validation/netzobFormatMatchScore.py b/src/nemere/validation/netzobFormatMatchScore.py
index cc99684b..3b53c9ad 100644
--- a/src/nemere/validation/netzobFormatMatchScore.py
+++ b/src/nemere/validation/netzobFormatMatchScore.py
@@ -5,16 +5,19 @@
import csv
import os
import time
-
from os.path import abspath, isdir
from typing import Dict, Tuple, List
+from concurrent.futures.process import ProcessPoolExecutor
+from concurrent.futures import TimeoutError as FutureTOError
+
from netzob import all as netzob
from netzob.Common.Utils.MatrixList import MatrixList
from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage
from nemere.utils.loader import SpecimenLoader
-from nemere.validation.dissectorMatcher import FormatMatchScore, MessageComparator
+from nemere.validation.dissectorMatcher import FormatMatchScore, MessageComparator, \
+ stop_process_pool, messageparsetimeout
def printFMS(
formatmatchmetrics: Dict[Tuple[int, netzob.Symbol, List[tuple]], Tuple[int, int, int, int, int]],
@@ -82,12 +85,18 @@ def minMaxMean(formatmatchmetrics: Dict[Tuple[int, AbstractMessage], FormatMatch
"""
import numpy
+ countEmpty = 0
thrScores = dict()
for (th, msg), fms in formatmatchmetrics.items():
+ # ignore parsing errors
+ if fms.score is None:
+ countEmpty += 1
+ continue
if th not in thrScores:
thrScores[th] = list()
thrScores[th].append(fms.score)
+ print("Empty inferences ignored:", countEmpty)
return {th: (numpy.min(sc), numpy.max(sc), numpy.mean(sc)) for th, sc in thrScores.items()}
@@ -129,7 +138,7 @@ def writeReport(formatmatchmetrics: Dict[Tuple[int, AbstractMessage], FormatMatc
"""
absFolder = abspath(folder)
if not isdir(absFolder):
- raise NotADirectoryError("The reports folder {:d} is not a directory. Reports cannot be written there.".format(absFolder))
+ raise NotADirectoryError("The reports folder {} is not a directory. Reports cannot be written there.".format(absFolder))
pcapName = os.path.splitext(os.path.basename(specimens.pcapFileName))[0]
reportFolder = os.path.join(absFolder, pcapName + "_clByAlign_" + time.strftime("%Y%m%d-%H%M%S", time.localtime()))
os.makedirs(reportFolder)
@@ -164,7 +173,17 @@ def writeReport(formatmatchmetrics: Dict[Tuple[int, AbstractMessage], FormatMatc
with open(os.path.join(reportFolder, fileNameS + '.csv'), 'w') as csvfile:
symbolcsv = csv.writer(csvfile)
symbolcsv.writerow([field.name for field in symbol.fields])
- symbolcsv.writerows([[val.hex() for val in msg] for msg in symbol.getCells()])
+ # wait only messageparsetimeout seconds for Netzob's MessageParser to return the result
+ with ProcessPoolExecutor(max_workers=1) as executor:
+ try:
+ future = executor.submit(symbol.getCells)
+ cells = future.result(messageparsetimeout)
+ symbolcsv.writerows([[val.hex() for val in msg] for msg in cells])
+ except FutureTOError as e:
+ stop_process_pool(executor)
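+ # a future that is already running cannot be cancelled; tearing down the
+ # pool (which presumably kills its worker process) enforces the timeout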
+ symbolcsv.writerow(["Parsing of symbol", symbol.name, "timed out after",
+ messageparsetimeout, "seconds. Omitting", len(symbol.messages),
+ "messages in this symbol."])
# thrSym = dict()
# for thr, sym in uniqueSymbols:
diff --git a/src/nemere/validation/protocols/__init__.py b/src/nemere/validation/protocols/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/nemere/validation/protocols/wlan.py b/src/nemere/validation/protocols/wlan.py
new file mode 100644
index 00000000..a5bc1109
--- /dev/null
+++ b/src/nemere/validation/protocols/wlan.py
@@ -0,0 +1,339 @@
+import re
+from typing import List, Tuple, Union
+
+from ..messageParser import ParsingConstants, MessageTypeIdentifiers
+
+
+class MessageTypeIdentifiers_WLAN(MessageTypeIdentifiers):
+ # wlan discriminators
+ FOR_PROTCOL = dict()
+ FOR_PROTCOL['wlan.mgt'] = [
+ 'wlan.fixed.beacon'
+ # 'wlan.fc.type_subtype' # would be correct, however we do not see the outer wlan frame
+ ]
+
+ # names for message types based on discriminator values
+ NAMED_TYPES = {
+ 'wlan.fixed.beacon' : { # any value of this field type is used as a workaround for the missing outer frame
+ '6400': 'Beacon frame',
+ '6600': 'Beacon frame',
+ 'c800': 'Beacon frame',
+ }
+ # see above # 'wlan.fc.type_subtype' : { '0008' : 'Beacon frame' }
+ }
+
+class WLAN(ParsingConstants):
+ COMPATIBLE_TO = b'3.2.5'
+ MESSAGE_TYPE_IDS = MessageTypeIdentifiers_WLAN
+
+ # NOTE: in most cases, you would want to append **_raw**
+ IGNORE_FIELDS = [
+ 'wlan.tag_raw', 'wlan.tag',
+ 'wlan.tagged.all_raw', 'wlan.fixed.all_raw',
+ 'wlan.tag.vendor.oui.type_raw',
+ # 'wlan.wfa.ie.type_raw',
+ 'wps.vendor_id_raw', 'wlan.qbss.version_raw', 'wlan.tim.aid_raw',
+ 'wlan.mobility_domain.ft_capab.ft_over_ds_raw', 'wlan.mobility_domain.ft_capab.resource_req_raw',
+
+ 'wps.config_methods.usba_raw', 'wps.config_methods.ethernet_raw', 'wps.config_methods.label_raw',
+ 'wps.config_methods.display_raw', 'wps.config_methods.virt_display_raw', 'wps.config_methods.phy_display_raw',
+ 'wps.config_methods.nfcext_raw', 'wps.config_methods.nfcint_raw', 'wps.config_methods.nfcinf_raw',
+ 'wps.config_methods.pushbutton_raw', 'wps.config_methods.virt_pushbutton_raw',
+ 'wps.config_methods.phy_pushbutton_raw', 'wps.config_methods.keypad_raw',
+ 'wps.primary_device_type.category_raw', 'wps.primary_device_type.subcategory_network_infrastructure_raw',
+ 'wps.primary_device_type.subcategory_printers_scanners_faxes_copiers_raw',
+ 'wps.primary_device_type.subcategory_computer_raw', 'wps.primary_device_type.subcategory_displays_raw',
+
+ 'wlan.rsn.pcs_raw', 'wlan.rsn.pcs.list_raw', 'wlan.rsn.akms_raw', 'wlan.rsn.akms.list_raw'
+ ]
+ EXCLUDE_SUB_FIELDS = [
+ 'wlan.tag_raw', 'wlan.tag', 'wlan.ext_tag',
+
+ 'wlan.fixed.baparams_tree', 'wlan.fixed.ssc_tree', 'wlan.txbf_tree', 'wlan.asel_tree', 'wlan.extcap_tree',
+ 'wlan.ht.capabilities_tree', 'wlan.ht.ampduparam_tree', 'wlan.ht.mcsset', 'wlan.ht.info.delim1_tree',
+ 'wlan.ht.info.delim2_tree', 'wlan.ht.info.delim3_tree', 'wlan.vht.mcsset', 'wlan.rmcap_tree',
+ 'wlan.fixed.capabilities_tree', 'wlan.country_info.fnm', 'wlan.tim.bmapctl_tree',
+ 'wlan.erp_info_tree', 'wlan.htex.capabilities_tree',
+ 'wlan.rsn.gcs_tree', 'wlan.rsn.capabilities_tree',
+ 'wlan.rsn.gmcs_tree', 'wlan.20_40_bc_tree',
+ 'wlan.wfa.ie.wpa.mcs_tree', 'wlan.wfa.ie.wpa.ucs.list', 'wlan.wfa.ie.wpa.akms.list',
+ 'wlan.wfa.ie.wme.qos_info_tree', 'wlan.wfa.ie.wme.acp', 'wlan.atheros.ie.advcap.cap_tree',
+ 'wlan.hta.capabilities_tree', 'wlan.vs.ht.mcsset', 'wlan.vs.ht.capabilities_tree', 'wlan.vs.ht.ampduparam_tree',
+ 'wlan.vs.ht.mcsset', 'wlan.vs.htex.capabilities_tree', 'wlan.vs.txbf_tree', 'wlan.vs.asel_tree',
+ 'wlan.vht.op', 'wlan.vht.op.basicmcsmap_tree', 'wlan.vs.routerboard.subitem_tree',
+ ]
+ INCLUDE_SUBFIELDS = [
+ 'wlan.fixed.all', 'wlan.tagged.all', 'Fixed parameters',
+ 'Version: 0x10', 'UUID E', 'WFD Device Information',
+ 'Vendor Extension', 'Request Type: Enrollee, Info only (0x00)', 'Primary Device Type',
+ 'Association State: Not associated (0x0000)', 'Response Type: AP (0x03)',
+ 'Configuration Error: No Error (0x0000)', # 'wlan.ext_tag', 'wlan.ext_tag.he_mac_caps_tree'
+ 'wlan.rsn.akms.list', 'wlan.rsn.akms_tree', 'wlan.rsn.pcs.list', 'wlan.rsn.pcs_tree'
+ ]
+ # 'RF Bands: 2.4 GHz (0x01)', 'RF Bands: 2.4 and 5 GHz (0x03)',
+ # 'Config Methods: 0x3108', 'Config Methods: 0x4388'
+ # 'Manufacturer: ', 'Model Name: ', 'Model Number: ', 'Device Name: ',
+ # 'Manufacturer: Celeno Communication, Inc.', 'Model Name: Celeno Wireless AP 2.4G', 'Model Number: CL1800',
+ # 'Device Name: CelenoAP2.4G', 'Serial Number: 12345678', 'Device Password ID: PIN (default) (0x0000)',
+ # 'P2P Capability: Device 0x4 Group 0x1', 'P2P Device ID: b2:5a:da:23:c9:fd', 'Ap Setup Locked: 0x01',
+ # 'Selected Registrar: 0x00',
+ # 'Wifi Protected Setup State: Not configured (0x01)', 'Wifi Protected Setup State: Configured (0x02)',
+ INCLUDE_SUBFIELDS_RE = [ re.compile(pattern) for pattern in [
+ 'Manufacturer: .*', 'Model Name: .*', 'Model Number: .*', 'Device Name: .*',
+ 'RF Bands: .*', 'Config Methods: .*', 'Serial Number: .*', 'Device Password ID: .*',
+ 'P2P Capability: .*', 'P2P Device ID: .*', 'Ap Setup Locked: .*', 'Selected Registrar: .*',
+ 'Wifi Protected Setup State: .*'
+ ]]
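+ # e.g. the human-readable pseudo-field 'Manufacturer: Broadcom' matches the
+ # first pattern above and is therefore descended into by walkSubTree()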
+ # names of field nodes in the json that have a record structure (list[list[tuple]], not list[tuple[str, tuple]]).
+ RECORD_STRUCTURE = [ ]
+
+ # mapping of field names to general value types.
+ TYPELOOKUP = dict()
+ """:type: Dict[str, str]"""
+
+ TYPELOOKUP['wlan.fixed.timestamp'] = 'timestamp_le' # has value: 34f23e7b3c000000
+ TYPELOOKUP['wlan.fixed.beacon'] = 'int_le' # has value: 6400
+ TYPELOOKUP['wlan.fixed.capabilities'] = 'flags' # has value: 1104
+ TYPELOOKUP['wlan.fixed.action_code'] = 'enum' # has value: 00
+ TYPELOOKUP['wlan.fixed.dialog_token'] = 'int' # has value: 5c
+ TYPELOOKUP['wlan.fixed.baparams'] = 'flags' # has value: 0310
+ TYPELOOKUP['wlan.fixed.batimeout'] = 'int_le' # has value: 0000
+ TYPELOOKUP['wlan.fixed.ssc'] = 'int_le' # has value: b0d1
+
+ TYPELOOKUP['wlan.tag.number'] = 'enum' # has value: dd
+ TYPELOOKUP['wlan.tag.length'] = 'int_le' # has value: ff
+ TYPELOOKUP['wlan.tag.vendor.oui.type'] = 'enum' # has value: 0b
+
+ TYPELOOKUP['wlan.ssid'] = 'chars' # has value: 465249545a21426f7820574c414e2033313730
+ TYPELOOKUP['wlan.supported_rates'] = 'int' # has value: 82
+ TYPELOOKUP['wlan.extended_supported_rates'] = 'int' # has value: 30
+
+ TYPELOOKUP['wlan.ht.capabilities'] = 'flags' # has value: ad01
+ TYPELOOKUP['wlan.ht.ampduparam'] = 'flags' # has value: 17
+ TYPELOOKUP['wlan.ht.mcsset'] = 'flags' # has value: ffffff00000000000000000000000000
+ TYPELOOKUP['wlan.ht.info.primarychannel'] = 'int' # has value: 01
+ TYPELOOKUP['wlan.ht.info.delim1'] = 'flags' # has value: 08
+ TYPELOOKUP['wlan.ht.info.delim2'] = 'flags' # has value: 1500
+ TYPELOOKUP['wlan.ht.info.delim3'] = 'flags' # has value: 0000
+ TYPELOOKUP['wlan.vht.op'] = 'flags' # has value: 000100
+ TYPELOOKUP['wlan.vht.op.basicmcsmap'] = 'enum' # has value: fcff
+ TYPELOOKUP['wlan.vht.capabilities'] = 'flags' # has value: 32008003
+ # TODO actually this is a tree with tag.number = 17 (VHT Capabilities) containing type, length, and a series of flags
+ TYPELOOKUP['wlan.vht.mcsset'] = 'flags' # has value: faff0000faff0000
+ TYPELOOKUP['wlan.hta.control_channel'] = 'enum' # has value: 01
+ TYPELOOKUP['wlan.hta.capabilities'] = 'flags' # has value: 00
+ TYPELOOKUP['wlan.htex.capabilities'] = 'flags' # has value: 0000
+ TYPELOOKUP['wlan.txbf'] = 'flags' # has value: 00000000
+ TYPELOOKUP['wlan.asel'] = 'flags' # has value: 00
+
+ TYPELOOKUP['wlan.ds.current_channel'] = 'enum' # has value: 01
+ TYPELOOKUP['wlan.tim.dtim_count'] = 'int' # has value: 02
+ TYPELOOKUP['wlan.tim.dtim_period'] = 'int' # has value: 03
+ TYPELOOKUP['wlan.tim.bmapctl'] = 'flags' # has value: 00
+ TYPELOOKUP['wlan.tim.partial_virtual_bitmap'] = 'flags' # has value: 000000000000000000
+ TYPELOOKUP['wlan.erp_info'] = 'flags' # has value: 04
+ TYPELOOKUP['wlan.rsn.version'] = 'int_le' # has value: 0100
+ TYPELOOKUP['wlan.rsn.gcs'] = 'addr' # has value: 000fac02 # actually it is addr + enum
+ TYPELOOKUP['wlan.rsn.pcs.count'] = 'int_le' # has value: 0100
+ TYPELOOKUP['wlan.rsn.pcs.list'] = 'unknown' # has value: 000fac04
+ TYPELOOKUP['wlan.rsn.akms.count'] = 'int_le' # has value: 0100
+ TYPELOOKUP['wlan.rsn.akms.list'] = 'unknown' # has value: 000fac02
+ TYPELOOKUP['wlan.rsn.pcs.oui'] = 'addr' # has value: 000fac
+ TYPELOOKUP['wlan.rsn.pcs.type'] = 'enum' # has value: 04
+ TYPELOOKUP['wlan.rsn.akms.oui'] = 'addr' # has value: 000fac
+ TYPELOOKUP['wlan.rsn.akms.type'] = 'enum' # has value: 02
+ TYPELOOKUP['wlan.rsn.capabilities'] = 'flags' # has value: 0000
+ TYPELOOKUP['wlan.rsn.pmkid.count'] = 'int_le' # has value: 0000
+ TYPELOOKUP['wlan.rsn.gmcs'] = 'addr' # has value: 000fac06 # actually it is addr + enum
+ TYPELOOKUP['wlan.20_40_bc'] = 'flags' # has value: 00
+
+ TYPELOOKUP['wlan.wfa.ie.wpa.version'] = 'int_le' # has value: 0100
+ TYPELOOKUP['wlan.wfa.ie.wpa.mcs'] = 'addr' # has value: 0050f202
+ TYPELOOKUP['wlan.wfa.ie.wpa.ucs.count'] = 'int_le' # has value: 0100
+ TYPELOOKUP['wlan.wfa.ie.wpa.ucs.list'] = 'addr' # has value: 0050f202
+ TYPELOOKUP['wlan.wfa.ie.wpa.akms.count'] = 'int_le' # has value: 0100
+ TYPELOOKUP['wlan.wfa.ie.wpa.akms.list'] = 'addr' # has value: 0050f202
+ TYPELOOKUP['wlan.wfa.ie.type'] = 'enum' # has value: 02
+ TYPELOOKUP['wlan.wfa.ie.wme.subtype'] = 'enum' # has value: 01
+ TYPELOOKUP['wlan.wfa.ie.wme.version'] = 'enum' # has value: 01
+ TYPELOOKUP['wlan.wfa.ie.wme.qos_info'] = 'flags' # has value: 80
+ TYPELOOKUP['wlan.wfa.ie.wme.reserved'] = 'enum' # has value: 00
+ TYPELOOKUP['wlan.wfa.ie.wme.acp'] = 'flags' # has value: 03a40000
+
+ TYPELOOKUP['wlan.qbss.version'] = 'enum'
+ TYPELOOKUP['wlan.qbss.scount'] = 'int_le' # has value: 0000
+ TYPELOOKUP['wlan.qbss.cu'] = 'int' # has value: 3b
+ TYPELOOKUP['wlan.qbss.adc'] = 'int_le' # has value: 0000
+ TYPELOOKUP['wlan.obss.spd'] = 'int_le' # has value: 1400
+ TYPELOOKUP['wlan.obss.sad'] = 'int_le' # has value: 0a00
+ TYPELOOKUP['wlan.obss.cwtsi'] = 'int_le' # has value: 2c01
+ TYPELOOKUP['wlan.obss.sptpc'] = 'int_le' # has value: c800
+ TYPELOOKUP['wlan.obss.satpc'] = 'int_le' # has value: 1400
+ TYPELOOKUP['wlan.obss.wctdf'] = 'int_le' # has value: 0500
+ TYPELOOKUP['wlan.obss.sat'] = 'int_le' # has value: 1900
+ TYPELOOKUP['wlan.extcap'] = 'flags' # has value: 05
+ TYPELOOKUP['wlan.rmcap'] = 'flags' # has value: 72
+ TYPELOOKUP['wlan.country_info.code'] = 'chars' # has value: 4e4c
+ TYPELOOKUP['wlan.country_info.environment'] = 'enum' # has value: 20
+ TYPELOOKUP['wlan.country_info.fnm'] = 'enum' # has value: 010d14
+ TYPELOOKUP['wlan.ap_channel_report.channel_list'] = 'int' # has value: 01
+ TYPELOOKUP['wlan.ap_channel_report.operating_class'] = 'enum' # has value: 21
+ TYPELOOKUP['wlan.supopeclass.current'] = 'enum' # has value: 51
+
+ TYPELOOKUP['wps.type'] = 'enum' # has value: 1044
+ TYPELOOKUP['wps.length'] = 'int_le' # has value: 0001
+ TYPELOOKUP['wps.version'] = 'enum' # has value: 10
+ TYPELOOKUP['wps.wifi_protected_setup_state'] = 'enum' # has value: 02
+ TYPELOOKUP['wps.uuid_e'] = 'addr' # has value: e7a17c8b6184cf40054d6178ec45fb8e
+ TYPELOOKUP['wps.rf_bands'] = 'enum' # has value: 03
+ TYPELOOKUP['wps.response_type'] = 'enum' # has value: 03
+ TYPELOOKUP['wps.manufacturer'] = 'chars' # has value: 42726f6164636f6d
+ TYPELOOKUP['wps.model_name'] = 'chars' # has value: 42726f6164636f6d
+ TYPELOOKUP['wps.model_number'] = 'chars' # has value: 313233343536
+ TYPELOOKUP['wps.serial_number'] = 'chars' # has value: 31323334
+ TYPELOOKUP['wps.primary_device_type'] = 'enum' # has value: 00060050f2040001
+ TYPELOOKUP['wps.device_name'] = 'chars' # has value: 42726f6164636f6d4150
+ TYPELOOKUP['wps.config_methods'] = 'flags' # has value: 0000
+ TYPELOOKUP['wps.request_type'] = 'enum' # has value: 00
+ TYPELOOKUP['wps.association_state'] = 'enum' # has value: 0000
+ TYPELOOKUP['wps.configuration_error'] = 'enum' # has value: 0000
+ TYPELOOKUP['wps.device_password_id'] = 'enum' # has value: 0000
+ TYPELOOKUP['wps.ap_setup_locked'] = 'enum' # has value: 01
+ TYPELOOKUP['wps.selected_registrar'] = 'enum' # has value: 00
+
+ TYPELOOKUP['wlan.atheros.ie.type'] = 'enum' # has value: 01
+ TYPELOOKUP['wlan.atheros.ie.subtype'] = 'enum' # has value: 01
+ TYPELOOKUP['wlan.atheros.ie.version'] = 'enum' # has value: 00
+ TYPELOOKUP['wlan.atheros.ie.advcap.cap'] = 'flags' # has value: 00
+ TYPELOOKUP['wlan.atheros.ie.advcap.defkey'] = 'enum' # has value: ff7f
+ TYPELOOKUP['wlan.atheros.data'] = 'bytes' # has value: 08000a00
+
+ TYPELOOKUP['wlan.powercon.local'] = 'int' # has value: 00
+ TYPELOOKUP['wlan.tcprep.trsmt_pow'] = 'int' # has value: 14
+ TYPELOOKUP['wlan.tcprep.link_mrg'] = 'int' # has value: 00
+ TYPELOOKUP['wlan.ext_bss.mu_mimo_capable_sta_count'] = 'int_le' # has value: 0000
+ TYPELOOKUP['wlan.ext_bss.ss_underutilization'] = 'int' # has value: f3
+ TYPELOOKUP['wlan.ext_bss.observable_sec_20mhz_utilization'] = 'int' # has value: 8b
+ TYPELOOKUP['wlan.ext_bss.observable_sec_40mhz_utilization'] = 'int' # has value: 00
+ TYPELOOKUP['wlan.ext_bss.observable_sec_80mhz_utilization'] = 'int' # has value: 00
+
+ TYPELOOKUP['wlan.tag.symbol_proprietary.oui'] = 'addr' # has value: 00a0f8
+ TYPELOOKUP['wlan.tag.symbol_proprietary.extreme.assoc_clients'] = 'int_le' # has value: 0000
+ TYPELOOKUP['wlan.tag.symbol_proprietary.extreme.load_kbps'] = 'int_le' # has value: 0a00
+ TYPELOOKUP['wlan.tag.symbol_proprietary.extreme.load_pps'] = 'int_le' # has value: 3000
+ TYPELOOKUP['wlan.tag.symbol_proprietary.extreme.client_txpower'] = 'int_le' # has value: 0000
+ TYPELOOKUP['wlan.tag.symbol_proprietary.extreme.timestamp'] = 'timestamp_le' # has value: 2ef82f60
+
+ TYPELOOKUP['wlan.vs.ht.mcsset'] = 'flags' # has value: 00000000000000000000000000000000
+ TYPELOOKUP['wlan.vs.ht.capabilities'] = 'flags' # has value: ee19
+ TYPELOOKUP['wlan.vs.ht.ampduparam'] = 'flags' # has value: 1f
+ TYPELOOKUP['wlan.vs.htex.capabilities'] = 'flags' # has value: 0000
+ TYPELOOKUP['wlan.vs.txbf'] = 'flags' # has value: 00000000
+ TYPELOOKUP['wlan.vs.asel'] = 'flags' # has value: 00
+ TYPELOOKUP['wlan.vs.pren.type'] = 'enum' # has value: 04
+ TYPELOOKUP['wlan.vs.pren.unknown_data'] = 'unknown' # has value: 08bf0cb279ab03aaff0000aaff0000c005000100fcff
+ TYPELOOKUP['wlan.vs.extreme.subtype'] = 'enum' # has value: 03
+ TYPELOOKUP['wlan.vs.extreme.subdata'] = 'bytes' # has value: 00010000000000004871adb700140001949b2c0d883f00010c01
+ TYPELOOKUP['wlan.vs.routerboard.unknown'] = 'unknown' # has value: 0000
+ TYPELOOKUP['wlan.vs.routerboard.subitem'] = 'bytes'
+ # has value: 011e001000000066190600004534384438433932384337440000000000000000
+ # or: 05026c09
+ TYPELOOKUP['wlan.vs.aruba.subtype'] = 'enum' # has value: 04
+ TYPELOOKUP['wlan.vs.aruba.data'] = 'bytes' # has value: 0809
+
+ TYPELOOKUP['wlan.mobility_domain.mdid'] = 'addr' # has value: 38de
+ TYPELOOKUP['wlan.mobility_domain.ft_capab'] = 'flags' # has value: 01
+ TYPELOOKUP['wlan.cisco.ccx1.unknown'] = 'unknown' # has value: 05008f000f00ff035900
+ TYPELOOKUP['wlan.cisco.ccx1.name'] = 'chars' # has value: 41502d46494c2d303832360000000000
+ TYPELOOKUP['wlan.cisco.ccx1.clients'] = 'int' # has value: 00
+ TYPELOOKUP['wlan.cisco.ccx1.unknown2'] = 'unknown' # has value: 00002d
+ TYPELOOKUP['wlan.aironet.type'] = 'enum' # has value: 00
+ TYPELOOKUP['wlan.aironet.dtpc'] = 'int' # has value: 11
+ TYPELOOKUP['wlan.aironet.dtpc_unknown'] = 'unknown' # has value: 00
+ TYPELOOKUP['wlan.aironet.data'] = 'bytes' # has value: 0104
+ TYPELOOKUP['wlan.aironet.version'] = 'int' # has value: 05
+ TYPELOOKUP['wlan.aironet.clientmfp'] = 'enum' # has value: 01
+
+ # TODO ext_tag actually is a complex substructure seldom seen
+ TYPELOOKUP['wlan.ext_tag'] = 'unknown' # has value: 23010808180080203002000d009f08000000fdfffdff391cc7711c07
+ TYPELOOKUP['wlan.tag_raw'] = 'unknown' # has value: dd080050f20800120010
+ TYPELOOKUP['wps.vendor_extension'] = 'unknown' # has value: 00372a000120
+ TYPELOOKUP['wlan.tag.vendor.data'] = 'unknown' # has value: 0200101c0000
+ TYPELOOKUP['padding'] = 'pad' # has value: 00
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookUnknownOUItag(value: list, siblings: List[Tuple[str, str]]) -> Union[List[Tuple[str, str]], None]:
+ """
+ Hook to parse wlan.tag while excluding some vendor extensions that are not completely contained
+ in their subfields. Excluded are tags with 'Tag Number: Vendor Specific (221)' ('wlan.tag.number')
+ and one of the vendor OUI types 8, 9, 10, or 22, in particular:
+ * 'Type: Unknown (0x08)': the unknown OUI type 0x08 of Microsoft in a wlan.wfa.ie.type
+ * 'Vendor Specific OUI Type: 22' of the Wi-Fi Alliance
+ * 'WFD Device Information': OUI type '10' == '0a'
+ Also excluded, regardless of the OUI, are the tag numbers:
+ * 'Advertisement Protocol element: ANQP': tag number 108 == 0x6c
+ * wlan.interworking: tag number 107 == 0x6b
+ * wlan.supopeclass: tag number 59 == 0x3b
+
+ :param value: list of (name, value) subfields of the wlan.tag we are working on
+ :param siblings: subfields that we know of by now
+ :return: the (field name, value) pairs of the tag's subtree to add as new fields,
+     or None to omit the subfields of the excluded tags
+ """
+ from ..messageParser import ParsedMessage
+
+ # retrieve the tag type ("number"), we are interested only in 'Tag Number: Vendor Specific (221)'
+ tagnumbers = [tag[1] for tag in value if tag[0] == 'wlan.tag.number']
+
+ if len(tagnumbers) == 1 and tagnumbers[0] in ['108', '107', '59']:
+ return None
+
+ if len(tagnumbers) == 1 and tagnumbers[0] in ['221']:
+ ietypenumbers = [ietype[1] for ietype in value if ietype[0] == 'wlan.tag.vendor.oui.type']
+ if len(ietypenumbers) == 1 and ietypenumbers[0] in ['8', '9', '10', '22']:
+ return None
+
+ return ParsedMessage.walkSubTree(value)
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookUnknownOUIraw(value: str, siblings: List[Tuple[str, str]]) -> Union[List[Tuple[str, str]], None]:
+ """
+ Hook to keep the raw data of those tags that _hookUnknownOUItag excludes from subfield parsing,
+ e.g., the unknown OUI type 0x08 of Microsoft in a wlan.wfa.ie.type
+ and the 'Vendor Specific OUI Type: 22' of the Wi-Fi Alliance.
+
+ :param value: hex value of the field we are working on
+ :param siblings: subfields of our common parent that we know of by now
+ :return: list with a single ('wlan.tag_raw', value) tuple to add as new field, or None
+ """
+ # value[0:2] is the tag number (0xdd == 221, 0x6c == 108, 0x6b == 107, 0x3b == 59);
+ # value[10:12] is the vendor OUI type (0x08, 0x09, 0x0a, 0x16 == 8, 9, 10, 22)
+ if (value[0:2] == 'dd' and value[10:12] in ['08', '09', '0a', '16']) or value[0:2] in ['6c', '6b', '3b']:
+ return [('wlan.tag_raw', value)]
+ return None
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookSupopeclass(value: str, siblings: List[Tuple[str, str]]) -> Union[List[Tuple[str, str]], None]:
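+ """Posthook for 'wlan.supopeclass.current_raw' (see posthooks below): add a one-byte padding field ('00')."""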
+ return [('padding', '00')]
+
+ # noinspection PyUnusedLocal
+ @staticmethod
+ def _hookExtTag(value: str, siblings: List[Tuple[str, str]]) -> Union[List[Tuple[str, str]], None]:
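+ """Prehook for 'wlan.ext_tag' (see prehooks below): recover the tag number and extension-tag length from their raw values."""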
+ # retrieve the tag type ("number"), we are interested only in 'Tag Number: Element ID Extension (255)'
+ tagnumbers = [tag[1][0] for tag in value if tag[0] == 'wlan.tag.number_raw']
+ # and its length
+ taglengths = [tag[1][0] for tag in value if tag[0] == 'wlan.ext_tag.length_raw']
+ if len(tagnumbers) == 1 and tagnumbers[0][0] in ['ff'] and len(taglengths) == 1:
+ return [('wlan.tag.number', tagnumbers[0]), ('wlan.ext_tag.length', taglengths[0])]
+ return None
+
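+ # Registries for the hook functions above, keyed by the (raw) field name that triggers each hook
+ # while the tshark dissection is parsed.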
+ prehooks = dict()
+ # noinspection PyUnresolvedReferences
+ prehooks['wlan.ext_tag'] = _hookExtTag.__func__
+
+ posthooks = dict()
+ # noinspection PyUnresolvedReferences
+ posthooks['wlan.tag'] = _hookUnknownOUItag.__func__
+ # noinspection PyUnresolvedReferences
+ posthooks['wlan.tag_raw'] = _hookUnknownOUIraw.__func__
+ # noinspection PyUnresolvedReferences
+ posthooks['wlan.supopeclass.current_raw'] = _hookSupopeclass.__func__
+
diff --git a/src/nemere/validation/reportWriter.py b/src/nemere/validation/reportWriter.py
deleted file mode 100644
index 4e9f5886..00000000
--- a/src/nemere/validation/reportWriter.py
+++ /dev/null
@@ -1,138 +0,0 @@
-"""
-Write Format Match Score report for a list of analysed messages.
-"""
-
-import os
-import time
-import csv
-import numpy
-from typing import Dict, Tuple, Iterable
-from os.path import abspath, isdir, splitext, basename, join
-from itertools import chain
-
-from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage
-
-from nemere.utils.loader import SpecimenLoader
-from nemere.validation.dissectorMatcher import FormatMatchScore, MessageComparator
-
-
-def calcScoreStats(scores: Iterable[float]) -> Tuple[float, float, float, float, float]:
- """
- :param scores: An Iterable of FMS values.
- :return: min, meankey, max, mediankey, standard deviation of the scores,
- where meankey is the value in scores closest to the mean of its values,
- and median is the value in scores closest to the mean of its values.
- """
- scores = sorted(scores)
- fmsmin, fmsmean, fmsmax, fmsmedian, fmsstd = \
- numpy.min(scores), numpy.mean(scores), numpy.max(scores), numpy.median(scores), numpy.std(scores)
- # get quality key closest to mean
- fmsmeankey = 1
- if len(scores) > 2:
- for b,a in zip(scores[:-1], scores[1:]):
- if a < fmsmean:
- continue
- fmsmeankey = b if fmsmean - b < a - fmsmean else a
- break
- return float(fmsmin), float(fmsmeankey), float(fmsmax), float(fmsmedian), float(fmsstd)
-
-
-def getMinMeanMaxFMS(scores: Iterable[float]) -> Tuple[float, float, float]:
- """
- :param scores: An Iterable of FMS values.
- :return: min, meankey, and max of the scores,
- where meankey is the value in scores closest to the means of its values.
- """
- return calcScoreStats(scores)[:3]
-
-
-def countMatches(quality: Iterable[FormatMatchScore]):
- """
- :param quality: List of FormatMatchScores
- :return: count of exact matches, off-by-one near matches, off-by-more-than-one matches
- """
- exactcount = 0
- offbyonecount = 0
- offbymorecount = 0
- for fms in quality: # type: FormatMatchScore
- exactcount += fms.exactCount
- offbyonecount += sum(1 for truf, inff in fms.nearMatches.items() if abs(truf - inff) == 1)
- offbymorecount += sum(1 for truf, inff in fms.nearMatches.items() if abs(truf - inff) > 1)
- return exactcount, offbyonecount, offbymorecount
-
-
-def writeReport(formatmatchmetrics: Dict[AbstractMessage, FormatMatchScore],
- runtime: float,
- specimens: SpecimenLoader, comparator: MessageComparator,
- inferenceTitle: str, folder="reports"):
-
- absFolder = abspath(folder)
- if not isdir(absFolder):
- raise NotADirectoryError("The reports folder {} is not a directory. Reports cannot be written there.".format(
- absFolder))
- pcapName = splitext(basename(specimens.pcapFileName))[0]
- reportFolder = join(absFolder, pcapName + "_{}_{}".format(
- inferenceTitle, time.strftime("%Y%m%d-%H%M%S", time.localtime())))
- os.makedirs(reportFolder)
-
- print('Write report to ' + reportFolder)
-
- # write Format Match Score and Metrics to csv
- with open(os.path.join(reportFolder, 'FormatMatchMetrics.csv'), 'w') as csvfile:
- fmmcsv = csv.writer(csvfile)
- fmmcsv.writerow(["Message", "Score", 'I', 'M', 'N', 'S', 'MG', 'SP'])
- fmmcsv.writerows( [
- (message.data.hex(), fms.score,
- fms.inferredCount, fms.exactCount, fms.nearCount, fms.specificy, fms.matchGain, fms.specificyPenalty)
- for message, fms in formatmatchmetrics.items()] )
-
- scoreStats = calcScoreStats([q.score for q in formatmatchmetrics.values()])
- matchCounts = countMatches(formatmatchmetrics.values())
-
- with open(os.path.join(reportFolder, 'ScoreStatistics.csv'), 'w') as csvfile:
- fmmcsv = csv.writer(csvfile)
- fmmcsv.writerow(["inference", "min", "mean", "max", "median", "std",
- "exactcount", "offbyonecount", "offbymorecount", "runtime"])
- fmmcsv.writerow( [ inferenceTitle,
- *scoreStats, *matchCounts,
- runtime] )
-
- # write Symbols to csvs
- multipleSymbolCSVs = False
- if multipleSymbolCSVs:
- for cnt, symbol in enumerate( # by the set comprehension,
- { quality.symbol # remove identical symbols due to multiple formats
- for quality
- in formatmatchmetrics.values() } ):
- fileNameS = 'Symbol_{:s}_{:d}'.format(symbol.name, cnt)
- with open(os.path.join(reportFolder, fileNameS + '.csv'), 'w') as csvfile:
- symbolcsv = csv.writer(csvfile)
- symbolcsv.writerow([field.name for field in symbol.fields])
- symbolcsv.writerows([val.hex() for val in msg] for msg in symbol.getCells())
- else:
- fileNameS = 'Symbols'
- with open(os.path.join(reportFolder, fileNameS + '.csv'), 'w') as csvfile:
- symbolcsv = csv.writer(csvfile)
- msgcells = chain.from_iterable([sym.getCells() for sym in # unique symbols by set
- {fms.symbol for fms in formatmatchmetrics.values()}])
- symbolcsv.writerows(
- [val.hex() for val in msg] for msg in msgcells
- )
-
- # # write tshark-dissection to csv
- # # currently only unique formats. For a specific trace a baseline could be determined
- # # by a one time run of per ParsedMessage
- # with open(os.path.join(reportFolder, 'tshark-dissections.csv'), 'w') as csvfile:
- # formatscsv = csv.writer(csvfile)
- # revmsg = {l2m: l5m for l5m, l2m in specimens.messagePool.items()} # get L5 messages for the L2 in tformats
- # formatscsv.writerows([(revmsg[l2m].data.hex(), f) for l2m, f in tformats.items()])
-
-
- # FMS : Symbol
- score2symbol = {fms.score: fms.symbol for fms in formatmatchmetrics.values()}
-
- tikzcode = comparator.tprintInterleaved(score2symbol[mmm] for mmm in scoreStats[:3])
-
- # write Format Match Score and Metrics to csv
- with open(join(reportFolder, 'example-inference-minmeanmax.tikz'), 'w') as tikzfile:
- tikzfile.write(tikzcode)
\ No newline at end of file
diff --git a/src/nemere/validation/tsharkConnector.py b/src/nemere/validation/tsharkConnector.py
index 3e19389b..fe9b42df 100644
--- a/src/nemere/validation/tsharkConnector.py
+++ b/src/nemere/validation/tsharkConnector.py
@@ -1,7 +1,8 @@
import subprocess, io, struct, time
from queue import Queue
from tempfile import NamedTemporaryFile
-from typing import Dict
+from typing import Dict, Union
+
class TsharkConnector(object):
"""
@@ -35,10 +36,10 @@ class TsharkConnector(object):
def __init__(self, linktype : int):
self.__linktype = linktype
- self.__tshark = None # type: subprocess.Popen
+ self.__tshark = None # type: Union[subprocess.Popen, None]
self.__tsharkqueue = Queue()
- self.__tempfile = None # type: io.BufferedRandom
- self.__tempreader = None # type: io.BufferedReader
+ self.__tempfile = None # type: Union[io.BufferedRandom, None]
+ self.__tempreader = None # type: Union[io.BufferedReader, None]
self.__version = None
@@ -129,12 +130,15 @@ def readPacket(self):
readThread = threading.Thread(target=TsharkConnector.__readlines, args=(self.__tempreader, self.__tsharkqueue))
readThread.start()
# print("Wait for queue...")
+ # Wait for queue to fill from the tshark-pipe
for timeout in range(20):
if self.__tsharkqueue.empty():
time.sleep(.01)
+ # print("Wait a little...")
else:
break
- readThread.join(2.0)
+ print("Wait for tshark output (max 20s)...")
+ readThread.join(20.0)
if readThread.is_alive() or self.__tsharkqueue.empty():
raise TimeoutError("tshark timed out with no result.")
@@ -238,8 +242,8 @@ def checkTsharkCompatibility():
raise Exception('ERROR: The installed tshark does not support JSON output, which is required for '
'dissection parsing. Found tshark version {}. '
'Upgrade!\”'.format(versionlist[2].decode()))
- if versionlist[2] not in (b'2.2.6', b'2.6.3', b'2.6.5', b'2.6.8'):
- print("WARNING: Unchecked version {} of tshark in use! Dissections may be misfunctioning of faulty. "
+ if versionlist[2] not in (b'2.2.6', b'2.6.3', b'2.6.5', b'2.6.8', b'3.2.3', b'3.2.5'):
+ print("WARNING: Unchecked version {} of tshark in use! Dissections may be misfunctioning or faulty. "
"Check compatibility of JSON output!\n".format(versionlist[2].decode()))
return versionlist[2], False
return versionlist[2], True
diff --git a/src/nemere/visualization/distancesPlotter.py b/src/nemere/visualization/distancesPlotter.py
index 4c581b3f..4b98db81 100644
--- a/src/nemere/visualization/distancesPlotter.py
+++ b/src/nemere/visualization/distancesPlotter.py
@@ -2,8 +2,10 @@
import matplotlib.pyplot as plt
from matplotlib import cm, colors
-from typing import List, Any, Union
+from typing import List, Any, Union, Sequence
from itertools import compress
+from sklearn import manifold
+from sklearn.decomposition import PCA
from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage
@@ -20,25 +22,130 @@ class DistancesPlotter(MessagePlotter):
"""
def __init__(self, specimens: BaseLoader, analysisTitle: str,
- isInteractive: bool=False):
+ isInteractive: bool=False, plotSegmentValues=False):
super().__init__(specimens, analysisTitle, isInteractive)
+ self._autoLegend = False
+ # # plot configuration
plt.rc('xtick', labelsize=10) # fontsize of the tick labels
plt.rc('ytick', labelsize=10) # fontsize of the tick labels
self._fig, self._axes = plt.subplots(1,2, figsize=(10,5)) # type: plt.Figure, numpy.ndarray
if not isinstance(self._axes, numpy.ndarray):
self._axes = numpy.array(self._axes)
- self._fig.set_size_inches(16, 9)
- # self._cm = cm.Set1 # has 9 colors
- # self._cm = cm.tab20 # 20 colors
+ self._fig.set_size_inches(16, 8)
+ # self.cm = cm.Set1 # has 9 colors
+ # self.cm = cm.tab20 # 20 colors
# noinspection PyUnresolvedReferences
- self._cm = cm.jet # type: colors.LinearSegmentedColormap
+ self.cm = cm.jet # type: colors.LinearSegmentedColormap
+ """label color map"""
+ # noinspection PyUnresolvedReferences
+ self.fcm = cm.cubehelix
+ """type color map"""
+ self.labsize = 150
+ """label markers: size factor"""
+ self.typsize = 30
+ """type markers: size factor"""
+ self.maxSamples = 1000
+ """subsample the elements to plot if they are above this threshold"""
+ self._plotSegmentValues = plotSegmentValues
+
+ @property
+ def axesFlat(self):
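+ """Flat iterator over the subplot axes of the figure."""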
+ return self._axes.flat
+
+ def subsample(self,
+ segments: List[Union[MessageSegment, TypedSegment, TypedTemplate, Template, RawMessage, Any]],
+ distances: numpy.ndarray, labels: numpy.ndarray):
+ """
+ Subsample the elements to plot if their count exceeds twice the maxSamples threshold.
+
+ :param segments: The original segments, messages, or other elements to be plotted.
+ :param distances: The pairwise distances between all of the original segments.
+ :param labels: The labels for each of the original segments.
+ :return: If subsampling was necessary, a tuple of
+     (originalSegmentCount, and the subsampled values for segments, distances, labels);
+     otherwise False.
+ """
+ originalSegmentCount = len(segments)
+ if originalSegmentCount > 2 * self.maxSamples:
+ import math
+ ratiorev = originalSegmentCount / self.maxSamples
+ step2keep = math.floor(ratiorev)
+ lab2idx = dict()
+ for idx, lab in enumerate(labels):
+ if lab not in lab2idx:
+ lab2idx[lab] = list()
+ lab2idx[lab].append(idx)
+ # copy list to remove elements without side-effects
+ segments = segments.copy()
+ # to save the indices to be removed
+ idx2rem = list()
+ # determines a subset evenly distributed over all clusters while honoring the ratio to reduce to.
+ for lab, ics in lab2idx.items():
+ keep = set(ics[::step2keep])
+ idx2rem.extend(set(ics) - keep)
+ idx2rem = sorted(idx2rem, reverse=True)
+ for idx in idx2rem:
+ del segments[idx]
+ labels = numpy.delete(labels, idx2rem, 0)
+ distances = numpy.delete(numpy.delete(distances, idx2rem, 0), idx2rem, 1)
+ return originalSegmentCount, segments, distances, labels
+ else:
+ return False
+
+ @staticmethod
+ def manifoldPositions(distances: numpy.ndarray):
+ """prepare the 2 dimensionally projected positions for the input"""
+ # prepare MDS
+ seed = numpy.random.RandomState(seed=3)
+ mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
+ dissimilarity="precomputed", n_jobs=1)
+ pos = mds.fit(distances).embedding_
+ # Rotate the data
+ clf = PCA(n_components=2)
+ return clf.fit_transform(pos)
+
+ @staticmethod
+ def uniqueLabels(labels: numpy.ndarray,
+ segments: List[Union[MessageSegment, TypedSegment, TypedTemplate, Template, RawMessage, Any]]) \
+ -> List:
+ """identify unique labels"""
+ allabels = set(labels)
+ if None in allabels:
+ allabels.remove(None)
+ if False in allabels:
+ allabels.remove(False)
+ if all(isinstance(l, numpy.integer) or l.isdigit() for l in allabels if l != "Noise"):
+ ulab = sorted(allabels,
+ key=lambda l: -1 if l == "Noise" else int(l))
+ else:
+ ulab = sorted(allabels)
- def plotManifoldDistances(self, segments: List[Union[MessageSegment, TypedSegment, TypedTemplate, Template, RawMessage, Any]],
+ # omit noise in cluster labels if types are plotted anyway.
+ # the different handling is necessary due to the different noise markers in segments and messages.
+ if any(isinstance(seg, (TypedSegment, TypedTemplate)) for seg in segments):
+ for l in ulab.copy(): # iterate a copy so that remove() does not skip elements
+ # find a string label containing "Noise" and remove it
+ if isinstance(l, str) and "Noise" in l:
+ ulab.remove(l)
+ elif isinstance(segments[0], RawMessage) and segments[0].messageType != "Raw":
+ for l in ulab.copy(): # iterate a copy so that remove() does not skip elements
+ # find a -1 integer label and remove it
+ try:
+ if int(l) == -1:
+ ulab.remove(l)
+ except ValueError:
+ pass # not a problem, just keep the cluster, since it's not noise.
+ return ulab
+
+
+ def plotManifoldDistances(self,
+ segments: List[Union[MessageSegment, TypedSegment, TypedTemplate, Template, RawMessage, Any]],
distances: numpy.ndarray,
- labels: numpy.ndarray, templates: List=None, plotEdges = False, countMarkers = False):
+ labels: numpy.ndarray,
+ templates: List=None, plotEdges = False, countMarkers = False):
+ # noinspection PyUnresolvedReferences
"""
Plot distances of segments according to (presumably multidimensional) features.
This function abstracts from the actual feature by directly taking a precomputed similarity matrix and
@@ -93,94 +200,37 @@ def plotManifoldDistances(self, segments: List[Union[MessageSegment, TypedSegmen
quickly becomes a huge load especially when rendering the plot as PDF.
:param countMarkers: add text labels with information at positions with multiple markers
"""
- from sklearn import manifold
- from sklearn.decomposition import PCA
-
- # plot configuration
- labsize = 150 # label markers: size factor
- typsize = 30 # type markers: size factor
- # self._cm # label color map
- fcm = cm.cubehelix # type color map
+ assert isinstance(segments, Sequence)
+ assert isinstance(distances, numpy.ndarray)
+ assert isinstance(labels, numpy.ndarray)
+ assert len(segments) == distances.shape[0] == distances.shape[1]
- # identify unique labels
- allabels = set(labels)
- if all(isinstance(l, numpy.integer) or l.isdigit() for l in allabels if l != "Noise"):
- ulab = sorted(allabels,
- key=lambda l: -1 if l == "Noise" else int(l))
- else:
- ulab = sorted(allabels)
-
- # subsample if segment count is larger than maxSamples
- maxSamples = 1000
- originalSegmentCount = len(segments)
- if originalSegmentCount > 2*maxSamples:
- import math
- ratiorev = originalSegmentCount / maxSamples
- step2keep = math.floor(ratiorev)
- lab2idx = dict()
- for idx, lab in enumerate(labels):
- if lab not in lab2idx:
- lab2idx[lab] = list()
- lab2idx[lab].append(idx)
- # copy list to remove elements without side-effects
- segments = segments.copy()
- # to save the indices to be removed
- idx2rem = list()
- # determines a subset evenly distributed over all clusters while honoring the ratio to reduce to.
- for lab, ics in lab2idx.items():
- keep = set(ics[::step2keep])
- idx2rem.extend(set(ics) - keep)
- idx2rem = sorted(idx2rem, reverse=True)
- for idx in idx2rem:
- del segments[idx]
- labels = numpy.delete(labels, idx2rem, 0)
- distances = numpy.delete(numpy.delete(distances, idx2rem, 0), idx2rem, 1)
- else:
- idx2rem = None
-
-
- # prepare MDS
- seed = numpy.random.RandomState(seed=3)
- mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
- dissimilarity="precomputed", n_jobs=1)
- pos = mds.fit(distances).embedding_
- # print(distances)
-
- # Rotate the data
- clf = PCA(n_components=2)
-
- pos = clf.fit_transform(pos)
-
-
- fig = self._fig
axMDS, axSeg = self._axes # type: plt.Axes, plt.Axes
+ axMDS.set_aspect('equal', adjustable='datalim')
- if idx2rem is not None:
- axSeg.text(0, -5, 'Subsampled: {} of {} segments'.format(len(segments), originalSegmentCount))
+ # subsample if segment count is larger than maxSamples
+ subret = self.subsample(segments, distances, labels)
+ if subret:
+ originalSegmentCount, segments, distances, labels = subret
+ if self._plotSegmentValues:
+ botlef = (0, -5)
+ else:
+ botlef = (0.1, 0.1)
+ axSeg.text(*botlef, 'Subsampled: {} of {} segments'.format(len(segments), originalSegmentCount))
+ # without subsampling, existing values need not be overwritten
- # omit noise in cluster labels if types are plotted anyway.
- if any(isinstance(seg, (TypedSegment, TypedTemplate)) for seg in segments):
- for l in ulab:
- if isinstance(l, str) and "Noise" in l:
- ulab.remove(l)
- elif isinstance(segments[0], RawMessage) and segments[0].messageType != "Raw":
- for l in ulab:
- try:
- if int(l) == -1:
- ulab.remove(l)
- except ValueError as e:
- pass # not a problem, just keep the cluster, since its not noise.
+ pos = DistancesPlotter.manifoldPositions(distances)
- # prepare color space
- cIdx = [int(round(each)) for each in numpy.linspace(2, self._cm.N-2, len(ulab))]
+ # identify unique labels
+ ulab = DistancesPlotter.uniqueLabels(labels, segments)
if templates is None:
templates = ulab
- # iterate unique labels and scatter plot each of these clusters
+ # prepare color space
+ cIdx = [int(round(each)) for each in numpy.linspace(2, self.cm.N - 2, len(ulab))]
+
+ # CLUSTERS (large bubbles): iterate unique labels and scatter plot each of these clusters
for c, (l, t) in enumerate(zip(ulab, templates)): # type: int, (Any, Template)
- # test with:
- # color = [list(numpy.random.randint(0, 10, 4) / 10)]
- # plt.scatter(numpy.random.randint(0,10,4), numpy.random.randint(0,10,4), c=color)
- lColor = self._cm(cIdx[c])
+ lColor = self.cm(cIdx[c])
class_member_mask = (labels == l)
try:
x = list(compress(pos[:, 0].tolist(), class_member_mask))
@@ -188,7 +238,7 @@ def plotManifoldDistances(self, segments: List[Union[MessageSegment, TypedSegmen
# "If you want to specify the same RGB or RGBA value for all points, use a 2-D array with a single row."
# see https://matplotlib.org/api/_as_gen/matplotlib.pyplot.scatter.html:
axMDS.scatter(x, y, c=colors.to_rgba_array(lColor), alpha=.6,
- s = labsize,
+ s = self.labsize,
# s=s-(c*s/len(ulab)), #
lw=0, label=str(l))
except IndexError as e:
@@ -197,54 +247,56 @@ def plotManifoldDistances(self, segments: List[Union[MessageSegment, TypedSegmen
print(segments)
raise e
- if isinstance(t, Template):
+ if isinstance(t, Template) and self._plotSegmentValues:
axSeg.plot(t.values, c=lColor, linewidth=4)
-
- # include field type labels for TypedSegments input
+ # GROUND TRUTH (small bubbles): include field type labels for TypedSegments input
if any(isinstance(seg, (TypedSegment, TypedTemplate, RawMessage)) for seg in segments):
if any(isinstance(seg, (TypedSegment, TypedTemplate)) for seg in segments):
- ftypes = numpy.array([seg.fieldtype if isinstance(seg, (TypedSegment, TypedTemplate)) else "[unknown]" for seg in segments]) # PP
+ ftypes = numpy.array([seg.fieldtype if isinstance(seg, (TypedSegment, TypedTemplate))
+ else "[unknown]" for seg in segments]) # PP
elif any(isinstance(seg, RawMessage) and seg.messageType != 'Raw' for seg in segments):
- ftypes = numpy.array([msg.messageType if isinstance(msg, RawMessage) and msg.messageType != 'Raw' else "[unknown]" for msg in segments]) # PP
+ ftypes = numpy.array([msg.messageType if isinstance(msg, RawMessage) and msg.messageType != 'Raw'
+ else "[unknown]" for msg in segments]) # PP
else:
ftypes = set()
# identify unique types
utyp = sorted(set(ftypes))
# prepare color space
- # noinspection PyUnresolvedReferences
- cIdx = [int(round(each)) for each in numpy.linspace(30, fcm.N - 30, len(utyp))]
+ cIdx = [int(round(each)) for each in numpy.linspace(30, self.fcm.N - 30, len(utyp))]
# iterate unique types and scatter plot each of these groups
for n, ft in enumerate(utyp): # PP
- fColor = fcm(cIdx[n])
+ fColor = self.fcm(cIdx[n])
type_member_mask = (ftypes == ft)
x = list(compress(pos[:, 0].tolist(), type_member_mask))
y = list(compress(pos[:, 1].tolist(), type_member_mask))
# "If you want to specify the same RGB or RGBA value for all points, use a 2-D array with a single row."
# see https://matplotlib.org/api/_as_gen/matplotlib.pyplot.scatter.html:
axMDS.scatter(x, y, c=colors.to_rgba_array(fColor), alpha=1,
- s=typsize,
+ s=self.typsize,
lw=0, label=str(ft))
- if isinstance(segments[0], (TypedSegment, TypedTemplate)):
+ if isinstance(segments[0], (TypedSegment, TypedTemplate)) and self._plotSegmentValues:
for seg in compress(segments, type_member_mask):
axSeg.plot(seg.values, c=fColor, alpha=0.05)
- elif isinstance(segments[0], MessageSegment):
+ elif isinstance(segments[0], MessageSegment) and self._plotSegmentValues:
for c, l in enumerate(ulab):
- lColor = self._cm(cIdx[c])
+ lColor = self.cm(cIdx[c])
class_member_mask = (labels == l)
for seg in compress(segments, class_member_mask):
axSeg.plot(seg.values, c=lColor, alpha=0.1)
- else:
- axSeg.text(.5, .5, 'nothing to plot\n(message alignment)', horizontalalignment='center')
-
-
- # place the label/type legend at the best position
- if isinstance(segments[0], RawMessage):
- axMDS.legend(bbox_to_anchor=(1.04,1), scatterpoints=1, shadow=False)
+ elif self._plotSegmentValues:
+ axSeg.text(.5, .5, 'nothing to plot\n(message alignment)', horizontalalignment='center')
+
+ # place the label/type legend in the (otherwise empty) axSeg subfigure
+ if isinstance(segments[0], RawMessage) or not self._plotSegmentValues:
+ legendHandles, legendLabels = axMDS.get_legend_handles_labels()
+ # axMDS.legend(bbox_to_anchor=(1.04,1), scatterpoints=1, shadow=False)
+ axSeg.legend(handles=legendHandles, labels=legendLabels, loc='best', scatterpoints=1, shadow=False)
axSeg.patch.set_alpha(0.0)
axSeg.axis('off')
else:
+ # place the label/type legend at the best position
axMDS.legend(scatterpoints=1, loc='best', shadow=False)
@@ -264,7 +316,6 @@ def plotManifoldDistances(self, segments: List[Union[MessageSegment, TypedSegmen
lc.set_linewidths(0.5 * numpy.ones(len(segments)))
axMDS.add_collection(lc)
-
if countMarkers:
# Count markers at identical positions and plot text with information about the markers at this position
from collections import Counter
@@ -289,26 +340,25 @@ def plotManifoldDistances(self, segments: List[Union[MessageSegment, TypedSegmen
posYr = posY + r * math.sin(theta)
axMDS.text(posXr, posYr, "{}: {}".format(lab, cnt), withdash=True)
-
- fig.canvas.toolbar.update()
+ if self._fig.canvas.toolbar is not None:
+ self._fig.canvas.toolbar.update()
def _plot2dDistances(self, segments: List[MessageSegment], labels: List,
templates: List = None):
- fig = self._fig
axMDS, axSeg = self._axes
ulab = sorted(set(labels))
- cIdx = [each for each in numpy.linspace(0, self._cm.N - 2, len(ulab))]
+ cIdx = [each for each in numpy.linspace(0, self.cm.N - 2, len(ulab))]
if templates is None:
templates = ulab
coords = numpy.array([seg.values for seg in segments]) # type: numpy.ndarray
- s = 150 # size factor
+ # s = 150 # size factor
for c, (l, t) in enumerate(zip(ulab, templates)): # type: int, (Any, Template)
- lColor = self._cm(int(round(cIdx[c])))
+ lColor = self.cm(int(round(cIdx[c])))
class_member_mask = (labels == l)
try:
x = list(compress(coords[:, 0].tolist(), class_member_mask))
@@ -335,7 +385,8 @@ def _plot2dDistances(self, segments: List[MessageSegment], labels: List,
axMDS.legend(scatterpoints=1, loc='best', shadow=False)
- fig.canvas.toolbar.update()
+ if self._fig.canvas.toolbar is not None:
+ self._fig.canvas.toolbar.update()
diff --git a/src/nemere/visualization/multiPlotter.py b/src/nemere/visualization/multiPlotter.py
index bfca3169..7f58844c 100644
--- a/src/nemere/visualization/multiPlotter.py
+++ b/src/nemere/visualization/multiPlotter.py
@@ -199,7 +199,7 @@ def plotSubfigs(self, analysisResults: List[List[float]], subfigName: List[str]=
self.nameEachAx(subfigName)
if resultsLabel or compareLabel:
- plt.legend()
+ self._fig.legend()
# noinspection PyDefaultArgument
def printMessageBytes(self, messages: List[AbstractMessage], fontdict={'size': 2}):
@@ -277,7 +277,7 @@ def plotCorrelations(self,
# bvax.plot(analyzer.values,
# linewidth=.6, alpha=.6, c='blue', label='Bitvariances')
- plt.figlegend()
+ self._fig.legend()
def plotMultiSegmentLines(self, segmentGroups: List[Tuple[str, List[Tuple[str, TypedSegment]]]],
@@ -314,6 +314,13 @@ def plotToSubfig(self, subfigid: Union[int, plt.Axes], values: Union[List, numpy
"""
Plot values to selected subfigure.
+ >>> import nemere.visualization.multiPlotter
+ >>> import nemere.utils.loader
+ >>> import numpy
+ >>> loader = nemere.utils.loader.SpecimenLoader("input/maxdiff-fromOrig/dns_ictf2010_maxdiff-100.pcap")
+ >>> mmp = nemere.visualization.multiPlotter.MultiMessagePlotter(loader, "test", 4)
+ >>> mmp.plotToSubfig(2, numpy.random.poisson(5,100), numpy.random.poisson(5,100))
+
:param subfigid: Subfigure id to plot to.
:param values: Values to plot.
:param plotkwArgs: kwargs directly passed through to the pyplot plot function.
@@ -323,11 +330,12 @@ def plotToSubfig(self, subfigid: Union[int, plt.Axes], values: Union[List, numpy
def histoToSubfig(self, subfigid: int, data, **kwargs):
- self._axes.flat[subfigid].hist(data, **kwargs)
+ ret = self._axes.flat[subfigid].hist(data, **kwargs)
self._axes.flat[subfigid].legend()
+ return ret
- def writeOrShowFigure(self):
+ def writeOrShowFigure(self, plotfolder: str = None):
for sf in self.axes:
# deduplicate labels
handles, labels = sf.get_legend_handles_labels()
@@ -337,7 +345,7 @@ def writeOrShowFigure(self):
newLabels.append(label)
newHandles.append(handle)
sf.legend(newHandles, newLabels)
- super().writeOrShowFigure()
+ super().writeOrShowFigure(plotfolder)
diff --git a/src/nemere/visualization/plotter.py b/src/nemere/visualization/plotter.py
index 0f552227..2d34cbbb 100644
--- a/src/nemere/visualization/plotter.py
+++ b/src/nemere/visualization/plotter.py
@@ -3,6 +3,9 @@
import matplotlib.pyplot as plt
+from nemere.utils.evaluationHelpers import reportFolder
+
+
class MessagePlotter(object):
"""
Define basic functions and properties to plot messages.
@@ -32,6 +35,7 @@ def __init__(self, specimens: SpecimenLoader, analysisTitle: str, isInteractive:
self._specimens = specimens
self._title = analysisTitle
self._interactive = isInteractive
+ self._autoLegend = True
@property
def title(self) -> str:
@@ -41,18 +45,20 @@ def title(self) -> str:
# def figure(self):
# return self._figure
-
- def writeOrShowFigure(self):
+ def writeOrShowFigure(self, plotfolder: str=None):
"""
+ :param plotfolder: Folder to place the plot in. If not set, use reportFolder from nemere.utils.evaluationHelpers
+
If isInteractive was set to true, show the plot in a window, else write it to a file,
if none of the same name already exists. Closes all figures afterwards.
"""
- from nemere.utils.evaluationHelpers import reportFolder
-
+ if plotfolder is None:
+ plotfolder = reportFolder
pcapName = splitext(basename(self._specimens.pcapFileName))[0]
- plotfile = join(reportFolder, '{}_{}.pdf'.format(self._title, pcapName))
+ plotfile = join(plotfolder, '{}_{}.pdf'.format(self._title, pcapName))
- plt.legend()
+ if self._autoLegend:
+ plt.legend()
plt.suptitle('{} | {}'.format(pcapName, self._title))
plt.tight_layout(rect=[0,0,1,.95])
diff --git a/src/nemere/visualization/simplePrint.py b/src/nemere/visualization/simplePrint.py
index 34145008..9ad25870 100644
--- a/src/nemere/visualization/simplePrint.py
+++ b/src/nemere/visualization/simplePrint.py
@@ -1,11 +1,14 @@
+from itertools import chain
from typing import Tuple, Iterable, Sequence, Dict, List, Union
+
from tabulate import tabulate
from netzob.Common.Utils.MatrixList import MatrixList
from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage
-from nemere.inference.segments import MessageSegment, TypedSegment
+from nemere.inference.segments import MessageSegment
from nemere.inference.templates import DistanceCalculator, Template
+from nemere.validation.dissectorMatcher import MessageComparator
from nemere.visualization import bcolors as bcolors
@@ -100,4 +103,309 @@ def markSegmentInMessage(segment: Union[MessageSegment, Template]):
markSegmentInMessage(bs)
+def markSegNearMatch(segment: Union[Iterable[MessageSegment], MessageSegment, Template],
+ segmentedMessages: List[Sequence[MessageSegment]],
+ comparator: MessageComparator,
+ withContext: Union[bool,int]=False):
+ """
+ Print messages with the given segment in each message marked (underlined).
+ Supports Templates by resolving them to their base segments.
+
+ :param comparator: Comparator representing the true message dissections.
+ :param withContext: if an integer value, print this number of bytes as context before and after the segment.
+ :param segmentedMessages: the inferred segments (a list of segment sequences, one per message) to overlay as colors.
+ :param segment: segment, template, or list of segments that should be printed, i.e.,
+     marked within the printout of the message each originates from.
+ """
+ if isinstance(segment, Template):
+ segs = segment.baseSegments
+ elif isinstance(segment, Iterable):
+ segs = segment
+ else:
+ segs = [segment]
+
+ # print() # one blank line for visual structure
+ for seg in segs:
+ inf4seg = inferred4segment(seg, segmentedMessages)
+ if isinstance(withContext, int):
+ context = (seg.offset - withContext, seg.nextOffset + withContext)
+ else:
+ context = None
+ cprinter = ComparingPrinter(comparator, [inf4seg])
+ cprinter.toConsole([seg.message], (seg.offset, seg.nextOffset), context)
+
+ # # a simpler approach - without true fields marked as spaces
+ # markSegmentInMessage(segment)
+
+ # # get field number of next true field
+ # tsm = trueSegmentedMessages[segment.message] # type: List[MessageSegment]
+ # fsnum, offset = 0, 0
+ # while offset < segment.offset:
+ # offset += tsm[fsnum].offset
+ # fsnum += 1
+ # markSegmentInMessage(trueSegmentedMessages[segment.message][fsnum])
+
+ # # limit to immediate segment context
+ # posSegMatch = None # first segment that starts at or after the recognized field
+ # for sid, seg in enumerate(trueSegmentedMessages[segment.message]):
+ # if seg.offset > segment.offset:
+ # posSegMatch = sid
+ # break
+ # posSegEnd = None # last segment that ends after the recognized field
+ # for sid, seg in enumerate(trueSegmentedMessages[segment.message]):
+ # if seg.nextOffset > segment.nextOffset:
+ # posSegEnd = sid
+ # break
+ # if posSegMatch is not None:
+ # contextStart = max(posSegMatch - 2, 0)
+ # if posSegEnd is None:
+ # posSegEnd = posSegMatch
+ # contextEnd = min(posSegEnd + 1, len(trueSegmentedMessages))
+
+
+def inferred4segment(segment: MessageSegment, segmentedMessages: List[Sequence[MessageSegment]]) \
+ -> Sequence[MessageSegment]:
+ """
+ Determine all segments from segmentedMessages that belong to the message of the given segment.
+
+ :param segment: The input segment.
+ :param segmentedMessages: List of segmented messages to search in.
+ :return: All inferred segments for the message which the input segment is from.
+ """
+ return next(msegs for msegs in segmentedMessages if msegs[0].message == segment.message)
+
+
+class SegmentPrinter(object):
+ """
+ Printing of inferred segments within messages without ground truth.
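+
+ A typical use, as in src/nemesys.py (refinedPerMsg being a list of segment sequences, one per message):
+     segprint = SegmentPrinter(refinedPerMsg)
+     segprint.toConsole()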
+ """
+
+ def __init__(self, segmentsPerMsg: Sequence[Sequence[MessageSegment]]):
+ """
+ :param segmentsPerMsg: The segments that should be visualized by color changes or other optical features.
+ """
+ self._segmentedMessages = {msg[0].message:msg
+ for msg in segmentsPerMsg} # type: Dict[AbstractMessage, List[MessageSegment]]
+
+ @staticmethod
+ def _sliceMessageData(message: AbstractMessage, messageSlice: Tuple[Union[int,None],Union[int,None]]=None):
+ """
+ Slices a message into a selected substring and returns the absolute offsets of the cuts.
+
+ :param message: Message to slice a substring out of.
+ :param messageSlice: Tuple used as parameters of the slice of a message.
+     Use None to create an open slice (up to the beginning or end of the message).
+ :return: Tuple of the selected data snippet and the absolute (start, end) offsets of the slice.
+ """
+ msglen = len(message.data)
+ absSlice = (
+ messageSlice[0] if messageSlice is not None and messageSlice[0] is not None else 0,
+ messageSlice[1] if messageSlice is not None and messageSlice[1] is not None else msglen
+ )
+ dataSnip = message.data if messageSlice is None else message.data[slice(*messageSlice)]
+ return dataSnip, absSlice
+
+ @staticmethod
+ def _prepareMark(mark: Union[Tuple[int,int], MessageSegment], absSlice: Tuple[int,int]):
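+ """Normalize a mark given as MessageSegment to its (offset, nextOffset) tuple and assert it lies within absSlice."""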
+ if mark is not None:
+ if isinstance(mark, MessageSegment):
+ mark = mark.offset, mark.nextOffset
+ assert mark[0] >= absSlice[0], repr(mark) + "not valid with message slice" + repr(absSlice)
+ assert mark[1] <= absSlice[1], repr(mark) + "not valid with message slice" + repr(absSlice)
+ return mark
+
+ # noinspection PyMethodMayBeStatic
+ def _trueFieldEnds(self, message: AbstractMessage):
+ """
+ Just a dummy for subclasses to overwrite if there is ground truth to be printed.
+
+ :param message: is ignored
+ :return: always an empty tuple
+ """
+ return ()
+
+ def _inferredFieldStarts(self, message: AbstractMessage):
+ ifs = [seg.offset for seg in self._segmentedMessages[message]]
+ return ifs
+
+ def _inferredFieldEnds(self, message: AbstractMessage):
+ ife = [seg.nextOffset for seg in self._segmentedMessages[message]]
+ return ife
+
+ def toConsole(self, selectMessages: Iterable[AbstractMessage]=None, mark: Union[Tuple[int,int], MessageSegment]=None,
+ messageSlice: Tuple[Union[int,None],Union[int,None]]=None):
+ """
+ :param selectMessages: The messages from which to print the byte hex values. Also used to look up the
+ true field boundaries, if any, to mark by spaces in the printed byte hex values.
+ :param mark: Start and end indices of a range to mark by underlining.
+ :param messageSlice: Tuple used as parameters of the slice builtin to select a substring of each message
+     to print. Use None to create an open slice (up to the beginning or end of the message).
+ """
+ import nemere.visualization.bcolors as bc
+
+ for msg in selectMessages if selectMessages is not None else self._segmentedMessages.keys():
+ dataSnip, absSlice = SegmentPrinter._sliceMessageData(msg, messageSlice)
+ mark = SegmentPrinter._prepareMark(mark, absSlice)
+
+ tfe = self._trueFieldEnds(msg)
+ # inferred segments starts and ends (not necessarily covering the whole message!)
+ ifs = self._inferredFieldStarts(msg)
+ ife = self._inferredFieldEnds(msg)
+
+ hexdata = list() # type: List[str]
+ lastcolor = None
+ for po, by in enumerate(dataSnip, absSlice[0]):
+ # end mark
+ if mark is not None and po == mark[1]:
+ hexdata.append(bc.ENDC)
+ # restart color after mark end
+ if lastcolor is not None and lastcolor < po and po not in ifs and po not in ife:
+ hexdata.append(bc.eightBitColor(lastcolor % 231 + 1))
+
+ # have a space in place of each true field end in the hex data.
+ if po in tfe:
+ hexdata.append(' ')
+
+ # clear color at segment end
+ if po in ife:
+ lastcolor = None
+ hexdata.append(bc.ENDC)
+ # restart mark after color change
+ if mark is not None and mark[0] < po < mark[1]:
+ hexdata.append(bc.UNDERLINE)
+
+ # have a different color per each inferred field
+ if po in ifs:
+ assert lastcolor is None, "Some segment overlap prevented unambiguous coloring."
+ if po < absSlice[1]:
+ lastcolor = po
+ hexdata.append(bc.eightBitColor(po % 231 + 1))
+
+ # start mark
+ if mark is not None and po == mark[0]:
+ hexdata.append(bc.UNDERLINE)
+
+ # add the actual value
+ hexdata.append('{:02x}'.format(by))
+ hexdata.append(bc.ENDC)
+
+ print(''.join(hexdata).strip())
+
+ _basestyles = ["every node/.style={font=\\ttfamily, text height=.7em, outer sep=0, inner sep=0}",
+ "tfe/.style={draw, minimum height=1.2em, thick}",
+ "tfelabel/.style={rotate=-20, anchor=north west}",
+ "nonelabel/.style={}"]
+ _texhead = "\n\\begin{tikzpicture}[node distance=0pt, yscale=2,\n"
+ _texfoot = """
+ \end{tikzpicture}
+
+ \\centering
+ \\bigskip\ninferred fields: framed box
+
+ """
+ _tfemarker = '1ex '
+
+ def _msgoffs2label(self, msg, po):
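+ """Style option for the node of the byte at the given offset; empty in this base class, for subclasses to overwrite."""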
+ return ""
+
+ def toTikz(self, selectMessages: Iterable[AbstractMessage] = None, styles = None):
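+ """
+ Generate TikZ code that renders each selected message as a row of hex-byte nodes
+ and draws a frame (style 'tfe') around each inferred field.
+
+ :param selectMessages: The messages to render; defaults to all messages known from the segments.
+ :param styles: Additional TikZ styles, appended to the _basestyles of the class.
+ :return: The TikZ code as a string.
+ """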
+ if styles is None:
+ styles = type(self)._basestyles.copy()
+ else:
+ styles = type(self)._basestyles.copy() + styles
+
+ # start filling the texcode variable
+ texcode = type(self)._texhead
+ texcode += ",\n".join(styles) + "]"
+ for msgid, msg in enumerate(selectMessages if selectMessages is not None else self._segmentedMessages.keys()):
+ # true fields infos
+ tfe = self._trueFieldEnds(msg)
+ # inferred segments list (not necessarily covering the whole message!)
+ isegs = self._segmentedMessages[msg] if msg in self._segmentedMessages else []
+
+ hexdata = list() # type: List[str]
+ hexdata.append('\n\n\\coordinate(m{}f0) at (0,{});'.format(msgid, -msgid))
+ for po, by in enumerate(msg.data, start=1):
+ # add the actual value
+ hexdata.append('\\node[right={}of m{}f{}, {}] (m{}f{}) {{{:02x}}};'.format(
+ # have a 1ex space in place of each true field end in the hex data.
+ type(self)._tfemarker if po - 1 in tfe else '', msgid, po - 1,
+ # style for some label at this offset
+ self._msgoffs2label(msg, po),
+ msgid, po, by)
+ )
+ texcode += '\n'.join(hexdata)
+
+ # have a frame around each inferred field
+ fitnodes = list()
+ for seg in isegs:
+ fitnodes.append(
+ f'\\node[fit=(m{msgid}f{seg.offset + 1})(m{msgid}f{seg.nextOffset}), tfe] {{}};'
+ )
+ texcode += '\n' + '\n'.join(fitnodes)
+
+ texcode += type(self)._texfoot
+ return texcode + "\n"
+
+ def toTikzFile(self, selectMessages: Iterable[AbstractMessage] = None, styles = None, folder = None):
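+ """Write the TikZ code from toTikz to a new file 'inferredMessages.tikz' in the given folder or the default reportFolder."""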
+ from os.path import join, isdir, exists
+ if folder is None:
+ from ..utils.evaluationHelpers import reportFolder
+ folder = reportFolder
+ if not isdir(folder):
+ raise NotADirectoryError(
+ "The reports folder {} is not a directory. Reports cannot be written there.".format(
+ folder))
+ print('Write tikz to ' + folder)
+ filename = join(folder, 'inferredMessages.tikz')
+ if exists(filename):
+ raise FileExistsError("File already exists. Abort write of tikz file.")
+ with open(filename, 'w') as tikzfile:
+ tikzfile.write(self.toTikz(selectMessages, styles))
+
+
+class ComparingPrinter(SegmentPrinter):
+ """
+ Routines to generate beautiful human-readable representations of true and inferred message syntax for comparison.
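+
+ A typical use, as in markSegNearMatch above (seg being a MessageSegment of interest):
+     cprinter = ComparingPrinter(comparator, segmentsPerMsg)
+     cprinter.toConsole([seg.message], (seg.offset, seg.nextOffset))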
+ """
+ def __init__(self, comparator: MessageComparator, segmentsPerMsg: Sequence[Sequence[MessageSegment]]):
+ super().__init__(segmentsPerMsg)
+ self._comparator = comparator
+ """map of messages to their inferred segments, filled by _mapMessages2Segments."""
+
+ def __colorlabels(self, selectMessages: List[AbstractMessage]):
+ """Needs to return a set of all possible color labels to be returned by _offset2colorlabel."""
+ return {t[0] for msg in selectMessages for t in
+ self._comparator.parsedMessages[self._comparator.messages[msg]].getTypeSequence()}
+
+ def _offset2colorlabel(self, message):
+ """offset2type: style label for the field type"""
+ pm = self._comparator.parsedMessages[self._comparator.messages[message]]
+ typeSequence = pm.getTypeSequence()
+ return list(chain.from_iterable([lab] * lgt for lab, lgt in typeSequence))
+
+ def _offset2textlabel(self, message):
+ """offset2name: true field name as labels"""
+ pm = self._comparator.parsedMessages[self._comparator.messages[message]]
+ trueFieldNameMap = dict()
+ offset = 0
+ for name, lgt in pm.getFieldSequence():
+ trueFieldNameMap[offset] = name.replace("_", "\\_")
+ offset += lgt
+ return trueFieldNameMap
+
+ def _trueFieldEnds(self, message: AbstractMessage):
+ pm = self._comparator.parsedMessages[self._comparator.messages[message]]
+ typeSequence = pm.getTypeSequence()
+ # typeSequence = self.dissections[self._comparator.messages[message]] # dissections uses RawMessage as keys
+ return MessageComparator.fieldEndsFromLength([l for t, l in typeSequence])
+
+ _texfoot = """
+ \end{tikzpicture}
+
+ \\centering
+ \\bigskip\ntrue fields: SPACE | inferred fields: framed box
+
+ """
+
diff --git a/src/nemere/visualization/singlePlotter.py b/src/nemere/visualization/singlePlotter.py
index 330f7618..0aa06500 100644
--- a/src/nemere/visualization/singlePlotter.py
+++ b/src/nemere/visualization/singlePlotter.py
@@ -208,6 +208,7 @@ def histogramFieldEnds(self, symbols: List[Symbol]):
for symbol in symbols:
for message in symbol.messages: # type: AbstractMessage
maxLen = max(maxLen, len(message.data))
+ # TODO catch WatchdogTimeout in callers of fieldEndsPerSymbol
cumulatedFieldEnds.update(
MessageComparator.fieldEndsPerSymbol(symbol, message)[:-1]) # omit message end
diff --git a/src/nemesys.py b/src/nemesys.py
index e25aa2ac..ee0eed21 100644
--- a/src/nemesys.py
+++ b/src/nemesys.py
@@ -1,20 +1,23 @@
"""
+Reference implementation for calling NEMESYS: NEtwork MEssage Syntax analysYS with an unknown protocol.
+Usenix WOOT 2018.
+
Infer messages from PCAPs by the NEMESYS approach (BCDG-segmentation)
and write inference result to the terminal. Finally drop to an IPython console
and expose API to interact with the result.
-
-Usenix WOOT 2018.
"""
import argparse, time
from os.path import isfile
+from itertools import islice
+from typing import List
+
import IPython
+from netzob.Model.Vocabulary.Symbol import Symbol
from nemere.utils.loader import SpecimenLoader
from nemere.inference.segmentHandler import bcDeltaGaussMessageSegmentation, refinements, symbolsFromSegments
-
-
-
+import nemere.visualization.simplePrint as sP
if __name__ == '__main__':
parser = argparse.ArgumentParser(
@@ -51,11 +54,22 @@
print('Segmented and refined in {:.3f}s'.format(time.time() - startsegmentation))
symbols = symbolsFromSegments(segmentsPerMsg)
- refinedSymbols = symbolsFromSegments(refinedPerMsg)
+ refinedSymbols = symbolsFromSegments(refinedPerMsg) # type: List[Symbol]
+
+ # output visualization of at most 100 messages on terminal and into a tikz file
+ segprint = sP.SegmentPrinter(refinedPerMsg)
+ segprint.toConsole(islice(specimens.messagePool.keys(),100))
+ # segprint.toTikzFile()
+ # omit messages longer than 200 bytes (and not more than 100 messages)
+ segprint.toTikzFile(islice((msg for msg in specimens.messagePool.keys() if len(msg.data) < 200), 100))
+ # available for output:
+ # * nemere.utils.reportWriter.writeSegmentedMessages2CSV
+ # * from netzob.Export.WiresharkDissector.WiresharkDissector import WiresharkDissector
+ # WiresharkDissector.dissectSymbols(refinedSymbols, 'ari.lua')
+
- # TODO output (colored?) visualization on terminal
+ print("\nAccess inferred symbols via variables: symbols, refinedSymbols")
+ print("Access inferred message segments via variables: segmentsPerMsg, refinedPerMsg\n")
- print("Access inferred symbols via variables: symbols, refinedSymbols")
- print("Access inferred message segments via variables: segmentsPerMsg, refinedPerMsg")
IPython.embed()
\ No newline at end of file
diff --git a/src/nemesys_field-deviation-plot.py b/src/nemesys_field-deviation-plot.py
index 48c1746b..461fca06 100644
--- a/src/nemesys_field-deviation-plot.py
+++ b/src/nemesys_field-deviation-plot.py
@@ -32,7 +32,7 @@
parser.add_argument('-l', '--layer', type=int, default=2,
help='Protocol layer relative to IP to consider. Default is 2 layers above IP '
'(typically the payload of a transport protocol).')
- parser.add_argument('-r', '--relativeToIP', default=False, action='store_true')
+ parser.add_argument('-r', '--relativeToIP', default=True, action='store_true')
parser.add_argument('-c', '--columns', type=int, default=2,
help='Adjust width/aspect ratio for use in one USENIX column wide plot (1) or '
'for one USENIX column sideways leaving space for the caption (2)')
diff --git a/src/nemesys_fms.py b/src/nemesys_fms.py
index a7fbd470..21aa7cdd 100644
--- a/src/nemesys_fms.py
+++ b/src/nemesys_fms.py
@@ -16,9 +16,8 @@
from nemere.utils.loader import SpecimenLoader
from nemere.inference.analyzers import *
from nemere.inference.segmentHandler import bcDeltaGaussMessageSegmentation, \
- baseRefinements, originalRefinements, symbolsFromSegments
-from nemere.validation import reportWriter
-
+ baseRefinements, symbolsFromSegments
+from nemere.utils import reportWriter
debug = False
"""Some modules and methods contain debug output that can be activated by this flag."""
diff --git a/src/nemetyl.py b/src/nemetyl.py
new file mode 100644
index 00000000..a5e77b9c
--- /dev/null
+++ b/src/nemetyl.py
@@ -0,0 +1,195 @@
+"""
+Reference implementation for calling NEMETYL, the NEtwork MEssage TYpe identification by aLignment
+with an unknown protocol.
+INFOCOM 2020.
+
+Use different segmenters to tokenize messages and align segments on the similarity of their "personality"
+derived from the segments' features. The selected segmenter yields segments from each message. These segments are
+analyzed by the given analysis method which is used as feature to determine their similarity.
+Similar fields are then aligned to determine a score that is used as affinity value (dissimilarities) of messages
+for clustering. The clusters are refined by splitting and merging on heuristics.
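+
+Example invocation (the PCAP filename is a placeholder):
+    python src/nemetyl.py input/unknown-protocol.pcap -t nemesys -f nemetyl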
+"""
+
+import argparse
+import csv
+import os
+
+import IPython
+
+from nemere.alignment.alignMessages import TypeIdentificationByAlignment
+from nemere.inference.segmentHandler import originalRefinements, nemetylRefinements
+from nemere.utils.evaluationHelpers import StartupFilecheck, CachedDistances, TitleBuilder, writePerformanceStatistics
+
+# https://stackoverflow.com/questions/15639779/why-does-multiprocessing-use-only-a-single-core-after-i-import-numpy
+os.system("taskset -p 0xffffffffffffffffff %d" % os.getpid())
+
+debug = False
+
+# fix the analysis method to VALUE
+analysis_method = 'value'
+# fix the distance method to canberra
+distance_method = 'canberra'
+# tokenizers to select from
+tokenizers = ('4bytesfixed', 'nemesys')
+roundingprecision = 10**8
+# refinement methods
+refinementMethods = [
+ "none",
+ "original", # WOOT2018 paper
+ "nemetyl", # INFOCOM2020 paper: ConsecutiveChars+moco+splitfirstseg
+ ]
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(
+ description='Analyze fields as segments of messages and align on the similarity of their "personality" '
+ 'derived from the segments\' features.')
+ parser.add_argument('pcapfilename', help='Filename of the PCAP to load.')
+ parser.add_argument('-i', '--interactive', help='Open ipython prompt after finishing the analysis.',
+ action="store_true")
+ parser.add_argument('-l', '--layer', type=int, default=2,
+ help='Protocol layer relative to IP to consider. Default is 2 layers above IP '
+ '(typically the payload of a transport protocol).')
+ parser.add_argument('-r', '--relativeToIP', default=False, action='store_true')
+ parser.add_argument('-t', '--tokenizer', help='Select the tokenizer for this analysis run.',
+ choices=tokenizers, default="nemesys")
+ parser.add_argument('-s', '--sigma', type=float,
+ help='Only NEMESYS: sigma for noise reduction (gauss filter), default: 0.9')
+ parser.add_argument('-f', '--refinement', help='Select segment refinement method.', choices=refinementMethods,
+ default=refinementMethods[-1])
+ parser.add_argument('-p', '--with-plots', help='Generate plots.', action="store_true")
+ parser.add_argument('--littleendian', help="Assume little endian byte order for the feature values "
+ "(appends 'le' to the tokenizer name).", action="store_true")
+ args = parser.parse_args()
+
+ filechecker = StartupFilecheck(args.pcapfilename)
+ withplots = args.with_plots
+ littleendian = args.littleendian
+ tokenizer = args.tokenizer
+ if littleendian:
+ tokenizer += "le"
+
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # Cache/load the segmentation and segment dissimilarities
+ # to/from the filesystem to improve performance of repeated analyses of the same trace.
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ fromCache = CachedDistances(args.pcapfilename, analysis_method, args.layer, args.relativeToIP)
+ # Note! When manipulating distances calculation, deactivate caching by uncommenting the following assignment.
+ # fromCache.disableCache = True
+ fromCache.debug = debug
+ # As we analyze a truly unknown protocol, tell CachedDistances that it should not try to use tshark to obtain
+ # a dissection. The switch may be set to true for evaluating the approach with a known protocol.
+ # see src/nemetyl_align-segments.py
+ fromCache.dissectGroundtruth = False
+ fromCache.configureTokenizer(tokenizer, args.sigma)
+ if tokenizer[:7] == "nemesys":
+ if args.refinement == "original":
+ fromCache.configureRefinement(originalRefinements)
+ elif args.refinement == "nemetyl":
+ fromCache.configureRefinement(nemetylRefinements)
+ elif args.refinement is None or args.refinement == "none":
+ print("No refinement selected. Performing raw segmentation.")
+ else:
+ print(f"The refinement {args.refinement} is not supported with this tokenizer. Abort.")
+ exit(2)
+ fromCache.get()
+ segmentedMessages = fromCache.segmentedMessages
+ specimens, _, dc = fromCache.specimens, fromCache.comparator, fromCache.dc
+ segments = dc.rawSegments
+ segmentationTime, dist_calc_segmentsTime = fromCache.segmentationTime, fromCache.dist_calc_segmentsTime
+
+
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # Start the NEMETYL inference process
+ tyl = TypeIdentificationByAlignment(dc, segmentedMessages, tokenizer, specimens.messagePool)
+ # # # # # # # # # # # # # # # # # # # # # # # #
+
+
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # Calculate Alignment-Score and CLUSTER messages
+ tyl.clusterMessages()
+ # Prepare basic information about the inference run for the report
+ inferenceParams = TitleBuilder(tokenizer, args.refinement, args.sigma, tyl.clusterer)
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ if withplots:
+ # plot message distances and clusters
+ print("Plot distances...")
+ from nemere.visualization.distancesPlotter import DistancesPlotter
+ dp = DistancesPlotter(specimens, 'message-distances_' + inferenceParams.plotTitle, False)
+ dp.plotManifoldDistances(
+ [specimens.messagePool[seglist[0].message] for seglist in segmentedMessages],
+ tyl.sm.distances, tyl.labels) # segmentedMessages
+ dp.writeOrShowFigure(filechecker.reportFullPath)
+
+
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # ALIGN cluster members
+ tyl.alignClusterMembers()
+ # # # # # # # # # # # # # # # # # # # # # # # #
+
+
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # SPLIT clusters based on fields without rare values
+ inferenceParams.postProcess = tyl.splitClusters()
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ if withplots:
+ # plot distances and message clusters
+ print("Plot distances...")
+ from nemere.visualization.distancesPlotter import DistancesPlotter
+ dp = DistancesPlotter(specimens, 'message-distances_' + inferenceParams.plotTitle, False)
+ dp.plotManifoldDistances(
+ [specimens.messagePool[seglist[0].message] for seglist in segmentedMessages],
+ tyl.sm.distances, tyl.labels) # segmentedMessages
+ dp.writeOrShowFigure(filechecker.reportFullPath)
+
+
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # Check for cluster MERGE candidates
+ inferenceParams.postProcess = tyl.mergeClusters()
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ if withplots:
+ # plot distances and message clusters
+ print("Plot distances...")
+ from nemere.visualization.distancesPlotter import DistancesPlotter
+ dp = DistancesPlotter(specimens, 'message-distances_' + inferenceParams.plotTitle, False)
+ dp.plotManifoldDistances(
+ [specimens.messagePool[seglist[0].message] for seglist in segmentedMessages],
+ tyl.sm.distances, tyl.labels) # segmentedMessages
+ dp.writeOrShowFigure(filechecker.reportFullPath)
+
+
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # Write results to report
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ writePerformanceStatistics(
+ specimens, tyl.clusterer, inferenceParams.plotTitle,
+ segmentationTime, dist_calc_segmentsTime,
+ tyl.dist_calc_messagesTime, tyl.cluster_params_autoconfTime, tyl.cluster_messagesTime, tyl.align_messagesTime
+ )
+ filechecker.writeReportMetadata(fromCache.dccachefn if fromCache.isLoaded else None)
+
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # write alignments to csv
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ csvpath = os.path.join(filechecker.reportFullPath,
+ f"NEMETYL-symbols-{inferenceParams.plotTitle}-{filechecker.pcapstrippedname}.csv")
+ if not os.path.exists(csvpath):
+ print('Write alignments to {}...'.format(csvpath))
+ with open(csvpath, 'w') as csvfile:
+ symbolcsv = csv.writer(csvfile)
+ symbolcsv.writerow(["Cluster", "Type", "frame.time_epoch", "Field", "Alignment"])
+ for clunu, clusg in tyl.alignedClusters.items():
+ symbolcsv.writerows(
+ [clunu, "unknown"] # cluster label # message type string from gt
+ + [next(seg for seg in msg if seg is not None).message.date] # frame.time_epoch
+ + [sg.bytes.hex() if sg is not None else '' for sg in msg] for msg in clusg
+ )
+ else:
+ print("Symbols not saved. File {} already exists.".format(csvpath))
+ if not args.interactive:
+ IPython.embed()
+ # # # # # # # # # # # # # # # # # # # # # # # #
+
+
+ if args.interactive:
+ # noinspection PyUnresolvedReferences
+ from tabulate import tabulate
+ # globals().update(locals())
+ IPython.embed()
diff --git a/src/nemetyl_align-segments.py b/src/nemetyl_align-segments.py
index efba7ca5..ed7ebd95 100644
--- a/src/nemetyl_align-segments.py
+++ b/src/nemetyl_align-segments.py
@@ -1,39 +1,47 @@
"""
-Use groundtruth about field segmentation by dissectors and align segments
-on the similarity of their feature "personality".
+Evaluation implementation for NEMETYL, the NEtwork MEssage TYpe identification by aLignment.
+INFOCOM 2020.
-Takes a PCAP trace of a known protocol, dissects each message into their fields, and yields segments from each of them.
-These segments get analyzed by the given analysis method which is used as feature to determine their similarity.
-Similar fields are then aligned.
+Use different segmenters to tokenize messages and align segments on the similarity of their "personality"
+derived from the segments' features.
+This script uses groundtruth about field segmentation from tshark dissectors to evaluate the quality.
+Thus, it takes a PCAP trace of a known protocol, dissects each message into its fields, and compares the results to
+the selected heuristic segmenter.
+
+The selected segmenter yields segments from each message. These segments are analyzed by the given analysis method,
+which is used as a feature to determine their similarity. Similar fields are then aligned.
"""
import argparse, IPython
-from os.path import isfile, splitext, basename, exists, join
-import numpy
-from tabulate import tabulate
-from nemere.alignment.alignMessages import SegmentedMessages
-from nemere.inference.segmentHandler import originalRefinements, baseRefinements
+from nemere.alignment.alignMessages import TypeIdentificationByAlignment
+from nemere.inference.segmentHandler import originalRefinements, baseRefinements, \
+ nemetylRefinements
from nemere.alignment.hirschbergAlignSegments import HirschbergOnSegmentSimilarity
-from nemere.inference.segments import MessageSegment
-from nemere.inference.templates import DistanceCalculator
from nemere.utils.evaluationHelpers import *
+from nemere.utils.reportWriter import IndividualClusterReport, CombinatorialClustersReport
from nemere.visualization.multiPlotter import MultiMessagePlotter
-from nemere.alignment.clusterMerging import ClusterMerger
-from nemere.utils.baseAlgorithms import ecdf
+from nemere.alignment.clusterMerging import ClusterClusterer
+
+# https://stackoverflow.com/questions/15639779/why-does-multiprocessing-use-only-a-single-core-after-i-import-numpy
+os.system("taskset -p 0xffffffffffffffffff %d" % os.getpid())
debug = False
+# fix the analysis method to VALUE
analysis_method = 'value'
+# fix the distance method to canberra
distance_method = 'canberra'
+# tokenizers to select from
tokenizers = ('tshark', '4bytesfixed', 'nemesys')
roundingprecision = 10**8
-
# refinement methods
refinementMethods = [
+ "none",
"original", # WOOT2018 paper
- "base", # moco+splitfirstseg
+ "base", # ConsecutiveChars+moco
+ "nemetyl", # ConsecutiveChars+moco+splitfirstseg
]
@@ -54,7 +62,6 @@ def relign(segseqA, segseqB):
def columnOfAlignment(alignedSegments: List[List[MessageSegment]], colnum: int):
return [msg[colnum] for msg in alignedSegments]
-
# noinspection PyShadowingNames
def column2first(dc: DistanceCalculator, alignedSegments: List[List[MessageSegment]], colnum: int):
"""
@@ -71,7 +78,9 @@ def column2first(dc: DistanceCalculator, alignedSegments: List[List[MessageSegme
nonepos = [idx for idx, seg in enumerate(column) if seg is None]
stripedcol = [seg for seg in column if seg is not None]
- dists2first = ["- (reference)"] + list(dc.distancesSubset(stripedcol[0:1], stripedcol[1:]).tolist())[0] # type: List[Union[str, None]]
+ # noinspection PyTypeChecker
+ disulist = list(dc.distancesSubset(stripedcol[0:1], stripedcol[1:]).tolist()) # type: list
+ dists2first = ["- (reference)"] + disulist[0] # type: List[Union[str, None]]
# re-insert Nones
for idx in nonepos:
@@ -85,7 +94,6 @@ def printSegDist(d2ft: List[Tuple[MessageSegment, float]]):
print(tabulate([(s.bytes.hex() if isinstance(s, MessageSegment) else "-", d) for s, d in d2ft],
headers=['Seg (hex)', 'Distance'], floatfmt=".4f"))
-
# noinspection PyShadowingNames
def seg2seg(dc: DistanceCalculator, alignedSegments: List[List[MessageSegment]],
coordA: Tuple[int, int], coordB: Tuple[int, int]):
@@ -104,18 +112,18 @@ def seg2seg(dc: DistanceCalculator, alignedSegments: List[List[MessageSegment]],
print(segB)
return dc.pairDistance(segA, segB)
-
# noinspection PyShadowingNames
def quicksegmentTuple(dc: DistanceCalculator, segment: MessageSegment):
return dc.segments2index([segment])[0], segment.length, tuple(segment.values)
-def epsautoconfeval(epsilon):
+def epsautoconfeval(epsilon, plotTitle):
"""
investigate distance properties for clustering autoconfiguration
plots of k-nearest-neighbor distance histogram and "knee"
See SegmentedMessages#autoconfigureDBSCAN
+ :param plotTitle: Part of plot's filename and header
:param epsilon The manually determined "best" epsilon for comparison
:return:
"""
@@ -128,9 +136,9 @@ def epsautoconfeval(epsilon):
# hstplt.writeOrShowFigure()
# del hstplt
- neighbors = sm.neighbors() # list of tuples: (index from sm.distances, distance) sorted by distance
+ neighbors = tyl.sm.neighbors() # list of tuples: (index from sm.distances, distance) sorted by distance
- mmp = MultiMessagePlotter(specimens, tokenizer + "-knn-distance-funtion", 1, 2,
+ mmp = MultiMessagePlotter(specimens, "knn-distance-funtion_" + plotTitle, 1, 2,
isInteractive=False)
mmp.axes[0].axhline(epsilon, label="manually determined eps={:0.2f}".format(epsilon), c="red")
mmp.axes[1].axhline(epsilon, label="manually determined eps={:0.2f}".format(epsilon), c="red")
@@ -151,7 +159,7 @@ def epsautoconfeval(epsilon):
sigma = log(len(neighbors))
knearest = dict()
smoothknearest = dict()
- seconddiff = dict()
+ seconddiff = dict() # type: Dict[int, numpy.ndarray]
seconddiffMax = (0, 0, 0)
# ksteepeststats = list()
@@ -164,6 +172,7 @@ def epsautoconfeval(epsilon):
# max of second difference (maximum upwards curvature) as knee
seconddiff[k] = numpy.diff(smoothknearest[k], 2)
seconddiffargmax = seconddiff[k].argmax()
+ # noinspection PyArgumentList
diffrelmax = seconddiff[k].max() / smoothknearest[k][seconddiffargmax]
if 2*sigma < seconddiffargmax < len(neighbors) - 2*sigma and diffrelmax > seconddiffMax[2]:
seconddiffMax = (k, seconddiffargmax, diffrelmax)
@@ -191,7 +200,8 @@ def epsautoconfeval(epsilon):
ax0twin = mmp.axes[0].twinx()
# mmp.plotToSubfig(ax0twin, seconddiff[k], linestyle='dotted', color='cyan', alpha=.4)
- mmp.plotToSubfig(ax0twin, [None] + list(seconddiff[k].tolist()), linestyle='dotted',
+ # noinspection PyTypeChecker
+ mmp.plotToSubfig(ax0twin, [None] + seconddiff[k].tolist(), linestyle='dotted',
color='magenta', alpha=.4)
# epsilon = knearest[k][x]
@@ -203,7 +213,7 @@ def epsautoconfeval(epsilon):
mmp.axes[0].axvline(x, linestyle='dashed', color='blue', alpha=.4)
- mmp.writeOrShowFigure()
+ mmp.writeOrShowFigure(filechecker.reportFullPath)
del mmp
# if args.interactive:
@@ -213,7 +223,106 @@ def epsautoconfeval(epsilon):
return epsilon
+def clusterClusters():
+ """
+ Alternative idea of merging clusters by clustering them.
+ Does not improve merging; perhaps the similarity matrix is not good enough.
+ :return:
+ """
+ # ClusterClusterer
+ clusterclusterer = ClusterClusterer(tyl.alignedClusters, dc)
+ # clusterDists = clusterclusterer.calcClusterDistances()
+
+ mergeEps, mergeMpts = clusterclusterer.autoconfigureDBSCAN()
+
+ cluclu, labels, mergeclusterer = clusterclusterer.clusterMessageTypesDBSCAN(mergeEps, min_samples=2)
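+ # note that the autoconfigured mergeMpts is not used; min_samples is fixed to 2 here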
+ clusterClustersNoiseless = {k: v for k, v in cluclu.items() if k > -1}
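+ # label -1 is DBSCAN noise and is excluded from merging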
+ mergedClusters = ClusterClusterer.mergeClusteredClusters(clusterClustersNoiseless, tyl.messageObjClusters)
+ ClusterClusterer.printShouldMerge(list(clusterClustersNoiseless.values()), splitClusterReport.precisionRecallList)
+ mergedObjClusters = {lab: [comparator.messages[element[0].message] for element in segseq]
+ for lab, segseq in mergedClusters.items()}
+
+ inferenceParams.postProcess += "split+mergedAlt-{}-eps={:.2f}-min_samples={}".format(
+ type(mergeclusterer).__name__, mergeclusterer.eps, mergeclusterer.min_samples)
+ clusteredClusterReport = IndividualClusterReport(groundtruth, filechecker)
+ clusteredClusterReport.write(mergedObjClusters, inferenceParams.dict)
+
+ from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage
+ from nemere.visualization.distancesPlotter import DistancesPlotter
+ typedClusterDummys = list()
+ for clun in clusterclusterer.clusterOrder:
+ clusta = None
+ for stats in clusteredClusterReport.precisionRecallList:
+ if stats is not None and stats[0] == clun:
+ clusta = stats[1] if stats[2] == 1.0 else "({})".format(stats[1])
+ break
+ msgdum = RawMessage(messageType=clusta)
+ typedClusterDummys.append(msgdum)
+
+ dipl = DistancesPlotter(specimens, "cluster-clustering-" + inferenceParams.plotTitle, False)
+ dipl.plotManifoldDistances(typedClusterDummys, clusterclusterer.distances, labels)
+ dipl.writeOrShowFigure(filechecker.reportFullPath)
+
+def printm4c(clusters, selected, printer):
+ """
+ Print the messages of the selected clusters.
+ :param clusters: Mapping of cluster label to the cluster's messages.
+ :param selected: The cluster labels to print.
+ :param printer: Callable with one parameter for the message.
+ :return:
+ """
+ for lab, msgs in clusters.items():
+ if lab in selected:
+ print("\n\nCluster", lab, "\n")
+ for msg in msgs:
+ printer(msg)
+
+def printVals4Field(clusters, selected, field):
+ """all values for the given field name in the cluster"""
+ printm4c(clusters, selected,
+ lambda msg: print(comparator.parsedMessages[specimens.messagePool[msg[0].message]].getValuesByName(field)))
+
+def printSpecClu(clusters, selected):
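+ """Pretty-print the messages of the selected clusters, truncated at message offset 100."""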
+ printm4c(clusters, selected,
+ lambda msg: comparator.pprint2Interleaved(msg, messageSlice=(None, 100)))
+
+def countVals4Field(clusters, selected, field):
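+ """Count the occurrences of each value of the given field name, per selected cluster."""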
+ valCounter = dict()
+ for lab, msgs in clusters.items():
+ if lab in selected:
+ print("\n\nCluster", lab, "\n")
+ valCounter[lab] = Counter(chain.from_iterable(
+ comparator.parsedMessages[specimens.messagePool[msg[0].message]].getValuesByName(
+ field) for msg in msgs))
+ print(valCounter[lab])
+ # psftck = [set(tc.keys()) for tc in valCounter.values()]
+ # # these tag values are present in all psf messages
+ # psfcommon = psftck[0].intersection(*psftck)
+ # [[b for b in a if b not in psfcommon] for a in psftck]
+ return valCounter
+
+def singularFields(clusters, cmp, select):
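+ """Collect all values per field name within each selected cluster and derive the fields having a single value."""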
+ fields = defaultdict(lambda: defaultdict(set))
+ for lab, msgs in clusters.items():
+ if lab in select:
+ for msg in msgs:
+ pm = cmp.parsedMessages[cmp.specimens.messagePool[msg[0].message]]
+ for fn in pm.getFieldNames():
+ fields[lab][fn].update(pm.getValuesByName(fn))
+ # find fields that have a single value within the cluster
+ single = {lab: {fn: next(iter(val)) for fn, val in fnval.items() if len(val) == 1}
+ for lab, fnval in fields.items()}
+ return fields, single
+
+def discriminators(single, selectcount):
+ # find singular fields (e.g., from mifsingle) in other clusters that have different singular values there: discriminator
+ elgnis = defaultdict(dict)
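+ # "elgnis" ("single" reversed) inverts the mapping to: field name -> cluster label -> singular value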
+ for lab, fnval in single.items():
+ for fn, val in fnval.items():
+ elgnis[fn][lab] = val
+ return elgnis, {fn: {lab: val for lab, val in labval.items()} for fn, labval in elgnis.items()
+ if selectcount * 0.5 == len(set(labval.values()))} # to allow for overspecific clusters (above groundtruth types)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # END : Evaluation helpers # # # # # # # # # # # # # #
@@ -235,74 +344,71 @@ def epsautoconfeval(epsilon):
parser.add_argument('pcapfilename', help='Filename of the PCAP to load.')
parser.add_argument('-i', '--interactive', help='Open ipython prompt after finishing the analysis.',
action="store_true")
- parser.add_argument('-t', '--tokenizer', help='Select the tokenizer for this analysis run.', default="tshark")
+ parser.add_argument('-l', '--layer', type=int, default=2,
+ help='Protocol layer relative to IP to consider. Default is 2 layers above IP '
+ '(typically the payload of a transport protocol).')
+ parser.add_argument('-r', '--relativeToIP', default=False, action='store_true')
+ parser.add_argument('-t', '--tokenizer', help='Select the tokenizer for this analysis run.',
+ choices=tokenizers, default="tshark")
parser.add_argument('-s', '--sigma', type=float, help='Only NEMESYS: sigma for noise reduction (gauss filter),'
'default: 0.9')
- parser.add_argument('-r', '--refinement', help='Select segment refinement method.', choices=refinementMethods,
- default="base")
- parser.add_argument('--split', help='Use old split-clusters implementation.',
- action="store_true")
+ parser.add_argument('-f', '--refinement', help='Select segment refinement method.', choices=refinementMethods,
+ default=refinementMethods[-1])
parser.add_argument('-p', '--with-plots',
help='Generate plots.',
action="store_true")
args = parser.parse_args()
- withplots = args.with_plots
-
- print("\n\n")
-
- if not isfile(args.pcapfilename):
- print('File not found: ' + args.pcapfilename)
- exit(1)
- pcapbasename = basename(args.pcapfilename)
+ filechecker = StartupFilecheck(args.pcapfilename)
+ withplots = args.with_plots
analyzerType = analyses[analysis_method]
analysisArgs = None
analysisTitle = analysis_method
-
- if args.tokenizer in tokenizers:
- tokenizer = args.tokenizer
- else:
- print("Unsupported tokenizer:", args.tokenizer, "allowed values are:", tokenizers)
- exit(2)
+ tokenizer = args.tokenizer
# # # # # # # # # # # # # # # # # # # # # # # #
# cache/load the DistanceCalculator to the filesystem
- # # # # # # # # # # # # # # # # # # # # # # # #
- # TODO when manipulating distances, deactivate caching! by adding "True"
- # noinspection PyUnboundLocalVariable
- if args.tokenizer != "nemesys":
- specimens, comparator, segmentedMessages, dc, segmentationTime, dist_calc_segmentsTime = cacheAndLoadDC(
- args.pcapfilename, analysisTitle, tokenizer, debug, analyzerType, analysisArgs, args.sigma,
- refinementCallback=None
- # , disableCache=True
- )
- elif args.refinement == "original":
- specimens, comparator, segmentedMessages, dc, segmentationTime, dist_calc_segmentsTime = cacheAndLoadDC(
- args.pcapfilename, analysisTitle, tokenizer, debug, analyzerType, analysisArgs, args.sigma,
- refinementCallback=originalRefinements
- #, disableCache=True
- )
- elif args.refinement == "base":
- specimens, comparator, segmentedMessages, dc, segmentationTime, dist_calc_segmentsTime = cacheAndLoadDC(
- args.pcapfilename, analysisTitle, tokenizer, debug, analyzerType, analysisArgs, args.sigma,
- refinementCallback=baseRefinements
- #, disableCache=True
- )
- else:
- print("Unknown refinement", args.refinement, "\nAborting")
- exit(2)
- chainedSegments = dc.rawSegments
-
-
+ #
+ fromCache = CachedDistances(args.pcapfilename, analysisTitle, args.layer, args.relativeToIP)
+ # Note! When manipulating the distance calculation, deactivate caching by uncommenting the following assignment.
+ # fromCache.disableCache = True
+ fromCache.debug = debug
+ if analysisArgs is not None:
+ # noinspection PyArgumentList
+ fromCache.configureAnalysis(*analysisArgs)
+ fromCache.configureTokenizer(tokenizer, args.sigma)
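+ # sigma is only used by the NEMESYS tokenizer (gauss-filter noise reduction, see the -s argument)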
+
+ if tokenizer[:7] == "nemesys":
+ if args.refinement == "original":
+ fromCache.configureRefinement(originalRefinements)
+ elif args.refinement == "base":
+ fromCache.configureRefinement(baseRefinements)
+ elif args.refinement == "nemetyl":
+ fromCache.configureRefinement(nemetylRefinements)
+ else:
+ print("No refinement selected. Performing raw segmentation.")
+ fromCache.get()
+ segmentedMessages = fromCache.segmentedMessages
+ specimens, comparator, dc = fromCache.specimens, fromCache.comparator, fromCache.dc
+ segmentationTime, dist_calc_segmentsTime = fromCache.segmentationTime, fromCache.dist_calc_segmentsTime
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # Start the NEMETYL inference process
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ tyl = TypeIdentificationByAlignment(dc, segmentedMessages, tokenizer, specimens.messagePool)
+ # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # #
# if not exists(smcachefn):
- print("Calculate distance for {} messages...".format(len(segmentedMessages)))
- dist_calc_messagesTime = time.time()
- sm = SegmentedMessages(dc, segmentedMessages)
- dist_calc_messagesTime = time.time() - dist_calc_messagesTime
- # smcachefn = 'cache-sm-{}-{}-{}.{}'.format(analysisTitle, tokenparm, pcapName, 'sm')
+ #
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # Calculate Alignment-Score and CLUSTER messages
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ tyl.clusterMessages()
+ # Prepare basic information about the inference run for the report
+ inferenceParams = TitleBuilder(tokenizer, args.refinement, args.sigma, tyl.clusterer)
+ #
+ # smcachefn = 'cache-sm-{}-{}-{}.{}'.format(analysisTitle, tokenparm, filechecker.pcapstrippedname, 'sm')
# with open(smcachefn, 'wb') as f:
# pickle.dump(sm, f, pickle.HIGHEST_PROTOCOL)
# else:
@@ -312,422 +418,135 @@ def epsautoconfeval(epsilon):
# print('Loading of cached message distances failed.')
# exit(11)
# # # # # # # # # # # # # # # # # # # # # # # #
- cluster_params_autoconfTime = time.time()
- eps, min_samples = sm.autoconfigureDBSCAN()
- cluster_params_autoconfTime = time.time() - cluster_params_autoconfTime
- # # # # # # # # # # # # # # # # # # # # # # # #
+ if withplots:
+ # plot message distances and clusters
+ print("Plot distances...")
+ from nemere.visualization.distancesPlotter import DistancesPlotter
+ dp = DistancesPlotter(specimens, 'message-distances_' + inferenceParams.plotTitle, False)
+ dp.plotManifoldDistances(
+ [specimens.messagePool[seglist[0].message] for seglist in segmentedMessages],
+ tyl.sm.distances, tyl.labels) # segmentedMessages
+ dp.writeOrShowFigure(filechecker.reportFullPath)
+ # # # # # # # # # # # # # # # # # # # # # # # #
# DEBUG and TESTING
# # # # # # # # # # # # # # # # # # # # # # # #
# retrieve manually determined epsilon value
- # epsilon = message_epspertrace[pcapbasename] if pcapbasename in message_epspertrace else 0.15
- if tokenizer == "nemesys":
- eps = eps * .8
+ # epsilon = message_epspertrace[filechecker.pcapbasename]
+ # if filechecker.pcapbasename in message_epspertrace else 0.15
if withplots:
- epsConfirm = epsautoconfeval(eps)
- # DEBUG and TESTING
+ # parenthesized so that only the sigma/refinement suffix is conditional and the tokenizer always names the plot
+ epsConfirm = epsautoconfeval(tyl.eps, tokenizer + (f"-s{args.sigma}-{args.refinement}"
+ if tokenizer[:7] == "nemesys" else ""))
# # # # # # # # # # # # # # # # # # # # # # # #
-
-
- # # # # # # # # # # # # # # # # # # # # # # # #
- # cluster and align messages and calculate statistics of it
- # # # # # # # # # # # # # # # # # # # # # # # #
- print('Clustering messages...')
- cluster_messagesTime = time.time()
- messageClusters, labels, clusterer = sm.clusterMessageTypesDBSCAN(eps=eps, min_samples=3)
- cluster_messagesTime = time.time() - cluster_messagesTime
- plotTitle = "{}-{} eps {:.3f} ms {}".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples)
- # messageClusters, labels, clusterer = sm.clusterMessageTypesHDBSCAN()
- # plotTitle = "{}-{} mcs {} ms {}".format(
- # tokenizer, type(clusterer).__name__, clusterer.min_cluster_size, clusterer.min_samples)
+ # DEBUG and TESTING
# # # # # # # # # # # # # # # # # # # # # # # #
-
groundtruth = {msg: pm.messagetype for msg, pm in comparator.parsedMessages.items()}
for msg, mtype in groundtruth.items():
msg.messageType = mtype
-
minCsize = numpy.log(len(segmentedMessages))
-
- # TODO test run due to writeCollective(Message)ClusteringStaticstics implementation change!
# # # # # # # # # # # # # # # # # # # # # # # #
# write message clustering statistics to csv
# # # # # # # # # # # # # # # # # # # # # # # #
- clusterStats, conciseness = writeIndividualMessageClusteringStaticstics(
- messageClusters, groundtruth, "{}-{}-eps={:.2f}-min_samples={}".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples), comparator)
+ fullClusterReport = IndividualClusterReport(groundtruth, filechecker)
+ fullClusterReport.write(tyl.messageObjClusters, inferenceParams.dict)
# # # # # # # #
- writeCollectiveMessageClusteringStaticstics(
- messageClusters, groundtruth, "{}-{}-eps={:.2f}-min_samples={}".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples), comparator)
- # # # # # # # # min cluster size
- noisekey = 'Noise' if 'Noise' in messageClusters else -1
- filteredClusters = {k: v for k, v in messageClusters.items() if len(v) >= minCsize}
+ fullCombinReport = CombinatorialClustersReport(groundtruth, filechecker)
+ fullCombinReport.write(tyl.messageObjClusters, inferenceParams.dict)
+
+ # # min cluster size # # # # # #
+ inferenceParams.postProcess = "minCsize"
+ noisekey = 'Noise' if 'Noise' in tyl.messageObjClusters else -1
+ filteredClusters = {k: v for k, v in tyl.messageObjClusters.items() if len(v) >= minCsize}
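+ # clusters smaller than minCsize are dissolved; their messages are reassigned to the noise bucket below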
filteredClusters[noisekey] = list() if not noisekey in filteredClusters else filteredClusters[noisekey].copy()
- filteredClusters[noisekey].extend(s for k, v in messageClusters.items()
+ filteredClusters[noisekey].extend(s for k, v in tyl.messageObjClusters.items()
if len(v) < minCsize for s in v)
- writeCollectiveMessageClusteringStaticstics(
- filteredClusters, groundtruth, "{}-{}-eps={:.2f}-min_samples={}-minCsize".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples), comparator)
+ filteredCombinReport = CombinatorialClustersReport(groundtruth, filechecker)
+ filteredCombinReport.write(filteredClusters, inferenceParams.dict)
+
+
+
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # ALIGN cluster members
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ tyl.alignClusterMembers()
# # # # # # # # # # # # # # # # # # # # # # # #
+
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # SPLIT clusters based on fields without rare values
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ inferenceParams.postProcess = tyl.splitClusters( # passing these kwargs activates the CSV output (activateCVSout)
+ runtitle = inferenceParams.dict,
+ trace = filechecker.pcapstrippedname,
+ clusterPrecisions = {cs[0]: cs[2] for cs in fullClusterReport.precisionRecallList if cs is not None})
+ # # # # # # # # # # # # # # # # # # # # # # # #
if withplots:
# plot distances and message clusters
print("Plot distances...")
from nemere.visualization.distancesPlotter import DistancesPlotter
- dp = DistancesPlotter(specimens, 'message-distances-' + plotTitle, False)
+ dp = DistancesPlotter(specimens, 'message-distances_' + inferenceParams.plotTitle, False)
dp.plotManifoldDistances(
[specimens.messagePool[seglist[0].message] for seglist in segmentedMessages],
- sm.distances, labels) # segmentedMessages
- dp.writeOrShowFigure()
-
+ tyl.sm.distances, tyl.labels) # segmentedMessages
+ dp.writeOrShowFigure(filechecker.reportFullPath)
# # # # # # # # # # # # # # # # # # # # # # # #
- # align cluster members
+ # clusterStats for splitter
# # # # # # # # # # # # # # # # # # # # # # # #
- align_messagesTime = time.time()
- alignedClusters = dict()
- alignedClustersHex = dict()
- print("Align each cluster...")
- for clunu, msgcluster in messageClusters.items(): # type: int, List[Tuple[MessageSegment]]
- clusteralignment, alignedsegments = sm.alignMessageType(msgcluster)
- alignedClusters[clunu] = alignedsegments
-
- # get gaps at the corresponding positions
- # print('Cluster', clunu)
- alignedClustersHex[clunu] = [[s.bytes.hex() if s is not None else None for s in m] for m in alignedsegments]
- print()
- align_messagesTime = time.time() - align_messagesTime
+ # write message clustering statistics to csv
# # # # # # # # # # # # # # # # # # # # # # # #
+ splitClusterReport = IndividualClusterReport(groundtruth, filechecker)
+ splitClusterReport.write(tyl.messageObjClusters, inferenceParams.dict)
+ # # # # # # # #
+ splitCombinReport = CombinatorialClustersReport(groundtruth, filechecker)
+ splitCombinReport.write(tyl.messageObjClusters, inferenceParams.dict)
-
-
-
-
-
-
+ # # # # # # # # min cluster size
+ inferenceParams.postProcess += "-minCsize"
+ noisekey = 'Noise' if 'Noise' in tyl.messageObjClusters else -1
+ filteredClusters = {k: v for k, v in tyl.messageObjClusters.items() if len(v) >= minCsize}
+ filteredClusters[noisekey] = list() if not noisekey in filteredClusters else filteredClusters[noisekey].copy()
+ filteredClusters[noisekey].extend(s for k, v in tyl.messageObjClusters.items()
+ if len(v) < minCsize for s in v)
+ filteredSplitCombinReport = CombinatorialClustersReport(groundtruth, filechecker)
+ filteredSplitCombinReport.write(filteredClusters, inferenceParams.dict)
+ # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # #
- # split clusters based on fields without rare values
+ # Check for cluster MERGE candidates
# # # # # # # # # # # # # # # # # # # # # # # #
- if not args.split:
- from nemere.alignment.clusterSplitting import *
-
- cSplitter = RelaxedExoticClusterSplitter(6 if not tokenizer == "tshark" else 3,
- alignedClusters, messageClusters, sm)
- cSplitter.activateCVSout("{}-{}-eps={:.2f}-min_samples={}".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples),
- comparator.specimens.pcapFileName, {cs[0]: cs[2] for cs in clusterStats if cs is not None})
- # in-place split of clusters in alignedClusters and messageClusters
- cSplitter.split()
- labels = cSplitter.labels
-
- # # # # # # # # # # # # # # # # # # # # # # # #
- if withplots:
- # plot distances and message clusters
- print("Plot distances...")
- from nemere.visualization.distancesPlotter import DistancesPlotter
-
- dp = DistancesPlotter(specimens, 'message-distances-' + plotTitle + '-split', False)
- dp.plotManifoldDistances(
- [specimens.messagePool[seglist[0].message] for seglist in segmentedMessages],
- sm.distances, labels) # segmentedMessages
- dp.writeOrShowFigure()
-
- # # # # # # # # # # # # # # # # # # # # # # # #
- # clusterStats for merger
- # # # # # # # # # # # # # # # # # # # # # # # #
- # write message clustering statistics to csv
- # # # # # # # # # # # # # # # # # # # # # # # #
- clusterStats, conciseness = writeIndividualMessageClusteringStaticstics(
- messageClusters, groundtruth, "{}-{}-eps={:.2f}-min_samples={}-split".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples), comparator)
- # # # # # # # #
- writeCollectiveMessageClusteringStaticstics(
- messageClusters, groundtruth, "{}-{}-eps={:.2f}-min_samples={}-split".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples), comparator)
- # # # # # # # # min cluster size
-
- noisekey = 'Noise' if 'Noise' in messageClusters else -1
- filteredClusters = {k: v for k, v in messageClusters.items()
- if len(v) >= minCsize }
- filteredClusters[noisekey] = list() if not noisekey in filteredClusters else filteredClusters[noisekey].copy()
- filteredClusters[noisekey].extend(s for k, v in messageClusters.items()
- if len(v) < minCsize for s in v)
- writeCollectiveMessageClusteringStaticstics(
- filteredClusters, groundtruth, "{}-{}-eps={:.2f}-min_samples={}-split-minCsize".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples), comparator)
- # # # # # # # # # # # # # # # # # # # # # # # #
+ inferenceParams.postProcess = tyl.mergeClusters()
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # clusterStats for merger
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ # write message clustering statistics to csv
+ # # # # # # # # # # # # # # # # # # # # # # # #
+ mergedClusterReport = IndividualClusterReport(groundtruth, filechecker)
+ mergedClusterReport.write(tyl.messageObjClusters, inferenceParams.dict)
+ # # # # # # # #
+ mergedCombinReport = CombinatorialClustersReport(groundtruth, filechecker)
+ mergedCombinReport.write(tyl.messageObjClusters, inferenceParams.dict)
+ # # # # # # # # min cluster size
+ inferenceParams.postProcess += "-minCsize"
+ noisekey = 'Noise' if 'Noise' in tyl.messageObjClusters else -1
+ filteredMerged = {k: v for k, v in tyl.messageObjClusters.items() if len(v) >= minCsize}
+ filteredMerged[noisekey] = list() if not noisekey in filteredMerged else filteredMerged[noisekey].copy()
+ filteredMerged[noisekey].extend(s for k, v in tyl.messageObjClusters.items()
+ if len(v) < minCsize for s in v)
+ filteredMergedCombinReport = CombinatorialClustersReport(groundtruth, filechecker)
+ filteredMergedCombinReport.write(filteredMerged, inferenceParams.dict)
- # # # # # # # # # # # # # # # # # # # # # # # #
- else:
- # old implementation
- from collections import Counter
-
- exoticValueStats = "reports/exotic-values-statistics.csv"
- fieldLenThresh = 6 if not tokenizer == "tshark" else 3
-
- clusterReplaceMap = dict()
- for aNum, aClu in alignedClusters.items():
- if aNum == -1:
- continue
- cPrec = next(cs[2] for cs in clusterStats if cs is not None and cs[0] == aNum)
- freqThresh = numpy.floor(numpy.log(len(aClu))) # numpy.round(numpy.log(len(aClu)))
- fields = [fld for fld in zip(*aClu)] # type: List[List[MessageSegment]]
- distinctVals4fields = [{tuple(val.values) for val in fld if val is not None} for fld in fields]
- # amount of distinct values per field
- valAmount4fields = [len(valSet) for valSet in distinctVals4fields]
-
- print("\nCluster", aNum, "of size", len(aClu),
- "- threshold", numpy.log(len(aClu)))
- print("Cluster should", "" if cPrec < 1 else "not", "be split. Precision is", cPrec)
-
- valCounts4fields = {fidx: Counter(tuple(seg.values) for seg in segs if seg is not None)
- for fidx, segs in enumerate(fields)} # type: Dict[int, Counter]
- pivotFieldIds = [fidx for fidx, vCnt in enumerate(valAmount4fields)
- if 1 < vCnt <= freqThresh # knee
- and len([True for val in fields[fidx] if val is None]) <= freqThresh # omit fields that have many gaps
- and not any(val.length > fieldLenThresh for val in fields[fidx] if val is not None) # omit fields longer than 3/4
- and not any(set(val.values) == {0} for val in fields[fidx] if val is not None) # omit fields that have zeros
- and not any(cnt <= freqThresh
- for cnt in valCounts4fields[fidx].values())] # remove fields with exotic values
-
- preExotic = [fidx for fidx, vCnt in enumerate(valAmount4fields)
- if 1 < vCnt <= freqThresh # knee
- and len([True for val in fields[fidx] if val is None]) <= freqThresh # omit fields that have many gaps
- and not any([val.length > fieldLenThresh for val in fields[fidx] if val is not None]) # omit fields longer than 3/4
- and not any([set(val.values) == {0} for val in fields[fidx] if val is not None]) # omit fields that have zeros
- ]
-
- for fidx in preExotic:
- scnt = sorted(valCounts4fields[fidx].values())
- diffmax = (numpy.diff(scnt).argmax()+1) if len(scnt) > 1 else "-"
- csvWriteHead = False if exists(exoticValueStats) else True
- with open(exoticValueStats, 'a') as csvfile:
- exoticcsv = csv.writer(csvfile) # type: csv.writer
- if csvWriteHead:
- exoticcsv.writerow([
- 'run_title', 'trace', 'cluster_label', 'precision', 'cluster_size', 'field',
- 'num_vals',
- 'maxdiff_n', 'maxdiff_v', 'sum=n', 'mean=n',
- 'stdev=n', 'median=n'
- ])
- fieldParameters = [ "{}-{}-eps={:.2f}-min_samples={}".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples),
- comparator.specimens.pcapFileName,
- aNum, cPrec, len(aClu), fidx, len(scnt)]
- if len(scnt) > 1:
- exoticcsv.writerow([
- *fieldParameters, diffmax, scnt[diffmax],
- sum(scnt[:diffmax]), sum(scnt[diffmax:]),
- numpy.mean(scnt[:diffmax]), numpy.mean(scnt[diffmax:]),
- numpy.std(scnt[:diffmax]), numpy.std(scnt[diffmax:]),
- numpy.median(scnt[:diffmax]), numpy.median(scnt[diffmax:])
- ])
- else:
- exoticcsv.writerow(fieldParameters + [""] * 10)
-
-
-
- newExotic = list()
- for fidx in preExotic:
- scnt = sorted(valCounts4fields[fidx].values())
- if len(scnt) > 1:
- if scnt[0] > freqThresh >= len(scnt):
- newExotic.append(fidx)
- continue
-
- # the pivot index and value to split the sorted list of type amounts
- # iVal, pVal = next((i, cnt) for i, cnt in enumerate(scnt) if cnt > freqThresh)
- iVal = numpy.diff(scnt).argmax() + 1
- # the special case of any(cnt <= freqThresh for cnt in scnt) is relaxedly included here
- numValues_u = len(scnt) - iVal
- # if there are no or only one frequent value, do not split
- if numValues_u > 1:
- pVal = scnt[iVal]
- mean_u = numpy.mean(scnt[iVal:])
- halfsRatio = sum(scnt[:iVal]) / sum(scnt[iVal:])
- if halfsRatio < 0.1 and mean_u > 2 * len(aClu) / numpy.log(len(aClu)):
- newExotic.append(fidx)
-
-
- addExotics = set(newExotic) - set(pivotFieldIds)
- remExotics = set(pivotFieldIds) - set(newExotic)
- if not len(addExotics) == len(remExotics) == 0:
- print("pivot field changes due to new exotics:")
- print(" add fields:", addExotics)
- print(" remove fields:", remExotics)
- # print info only if we do not split
- if len(newExotic) == 0:
- print("no pivot fields left")
- # exoticCondition = [(fidx, any(cnt <= freqThresh for cnt in valCounts4fields[fidx].values()))
- # for fidx, vCnt in enumerate(valAmount4fields)]
- continue # conditions not met for splitting: next cluster
- elif len(newExotic) > 2:
- print("too many pivot fields:", len(newExotic))
- continue # conditions not met for splitting: next cluster
-
-
- print(newExotic)
- # split clusters
- clusterSplits = dict() # type: Dict[Union[Tuple, None], List[Tuple[MessageSegment]]]
- for msgsegs in aClu:
- # concatenate multiple distinct field combinations
- pivotVals = tuple([(pId, *msgsegs[pId].values) if msgsegs[pId] is not None else None
- for pId in newExotic])
- if pivotVals not in clusterSplits:
- clusterSplits[pivotVals] = list()
- clusterSplits[pivotVals].append(msgsegs)
- clusterReplaceMap[aNum] = clusterSplits
- print("replace cluster", aNum, "by")
- print(tabulate((clusterSplits.keys())))
-
-
- # replace clusters by their splits
- for aNum, clusterSplits in clusterReplaceMap.items():
- for nci, cluSpl in enumerate(clusterSplits.values()): # type: int, List[Tuple[MessageSegment]]
- # newCluLabel = (aNum+1) * 100 + nci
- newCluLabel = "{}s{}".format(aNum, nci)
-
- msgs = [next(seg for seg in msgsegs if seg is not None).message for msgsegs in cluSpl]
- messageClusters[newCluLabel] = [msgsegs for msgsegs in messageClusters[aNum]
- if msgsegs[0].message in msgs]
-
- clusteralignment, alignedsegments = sm.alignMessageType(messageClusters[newCluLabel])
- alignedClusters[newCluLabel] = alignedsegments
-
- del alignedClusters[aNum]
- del messageClusters[aNum]
-
- # labels for distance plot
- msgLabelMap = {tuple(msgsegs): clunu for clunu, msgs in messageClusters.items() for msgsegs in msgs}
- labels = numpy.array([msgLabelMap[tuple(seglist)] for seglist in segmentedMessages])
-
- # # # # # # # # # # # # # # # # # # # # # # # #
- if withplots:
- # plot distances and message clusters
- print("Plot distances...")
- from nemere.visualization.distancesPlotter import DistancesPlotter
-
- dp = DistancesPlotter(specimens, 'message-distances-' + plotTitle + '-split', False)
- dp.plotManifoldDistances(
- [specimens.messagePool[seglist[0].message] for seglist in segmentedMessages],
- sm.distances, labels) # segmentedMessages
- dp.writeOrShowFigure()
-
- # # # # # # # # # # # # # # # # # # # # # # # #
- # clusterStats for merger
- # # # # # # # # # # # # # # # # # # # # # # # #
- # write message clustering statistics to csv
- # # # # # # # # # # # # # # # # # # # # # # # #
- clusterStats, conciseness = writeIndividualMessageClusteringStaticstics(
- messageClusters, groundtruth, "{}-{}-eps={:.2f}-min_samples={}-split".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples), comparator)
- # # # # # # # #
- writeCollectiveMessageClusteringStaticstics(
- messageClusters, groundtruth, "{}-{}-eps={:.2f}-min_samples={}-split".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples), comparator)
- # # # # # # # # min cluster size
-
- noisekey = 'Noise' if 'Noise' in messageClusters else -1
- filteredClusters = {k: v for k, v in messageClusters.items()
- if len(v) >= minCsize }
- filteredClusters[noisekey] = list() if not noisekey in filteredClusters else filteredClusters[noisekey].copy()
- filteredClusters[noisekey].extend(s for k, v in messageClusters.items()
- if len(v) < minCsize for s in v)
- writeCollectiveMessageClusteringStaticstics(
- filteredClusters, groundtruth, "{}-{}-eps={:.2f}-min_samples={}-split-minCsize".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples), comparator)
- # # # # # # # # # # # # # # # # # # # # # # # #
-
-
-
-
-
-
- # # # # # # # # # # # # # # # # # # # # # # # #
- # check for cluster merge candidates
- # # # # # # # # # # # # # # # # # # # # # # # #
- # TODO fully integrate into/encapsulate in ClusterMerger class
- print("Check for cluster merge candidates...")
- # noinspection PyUnreachableCode
- if True:
- # ClusterMerger
- clustermerger = ClusterMerger(alignedClusters, dc)
-
- alignedFieldClasses = clustermerger.alignFieldClasses((0, -1, 5)) # TODO alt1
- # alignedFieldClasses = clustermerger.alignFieldClasses((0, -5, 5)) # TODO alt2
- if tokenizer == "nemesys":
- alignedFieldClasses = clustermerger.gapMerging4nemesys(alignedFieldClasses)
- matchingConditions = clustermerger.generateMatchingConditions(alignedFieldClasses)
- matchingClusters = ClusterMerger.selectMatchingClusters(alignedFieldClasses, matchingConditions)
- mergedClusters = clustermerger.mergeClusters(
- messageClusters, clusterStats, alignedFieldClasses, matchingClusters, matchingConditions)
- mergedClusterStats, mergedConciseness = writeIndividualMessageClusteringStaticstics(
- mergedClusters, groundtruth,
- "merged-{}-{}-eps={:.2f}-min_samples={}".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples),
- comparator)
- # # # # # # # #
- writeCollectiveMessageClusteringStaticstics(
- mergedClusters, groundtruth,
- "merged-{}-{}-eps={:.2f}-min_samples={}".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples),
- comparator)
- # # # # # # # # min cluster size
- noisekey = 'Noise' if 'Noise' in mergedClusters else -1
- filteredMerged = {k: v for k, v in mergedClusters.items() if len(v) >= minCsize}
- filteredMerged[noisekey] = list() if not noisekey in filteredMerged else filteredMerged[noisekey].copy()
- filteredMerged[noisekey].extend(s for k, v in mergedClusters.items()
- if len(v) < minCsize for s in v)
- writeCollectiveMessageClusteringStaticstics(
- filteredMerged, groundtruth,
- "merged-{}-{}-eps={:.2f}-min_samples={}-minCsize".format(
- tokenizer, type(clusterer).__name__, clusterer.eps, clusterer.min_samples),
- comparator)
- else:
- # # # # # # # # # # # # # # # # # # # # #
- # alternative idea of clustering clusters: does not improve merging - perhaps the similarity matrix is not good enough?!
- # ClusterClusterer
- clusterclusterer = ClusterClusterer(alignedClusters, dc)
- # clusterDists = clusterclusterer.calcClusterDistances()
-
- mergeEps, mergeMpts = clusterclusterer.autoconfigureDBSCAN()
-
- clusterClusters, labels, mergeclusterer = clusterclusterer.clusterMessageTypesDBSCAN(mergeEps, min_samples=2)
- clusterClustersNoiseless = {k: v for k, v in clusterClusters.items() if k > -1}
- mergedClusters = ClusterClusterer.mergeClusteredClusters(clusterClustersNoiseless, messageClusters)
- ClusterClusterer.printShouldMerge(list(clusterClustersNoiseless.values()), clusterStats)
-
- mergedClusterStats, mergedConciseness = writeMessageClusteringStaticstics(
- mergedClusters, groundtruth,
- "merged-{}-{}-eps={:.2f}-min_samples={}".format(
- tokenizer, type(mergeclusterer).__name__, mergeclusterer.eps, mergeclusterer.min_samples),
- comparator)
-
- from netzob.Model.Vocabulary.Messages.RawMessage import RawMessage
- from nemere.visualization.distancesPlotter import DistancesPlotter
- typedClusterDummys = list()
- for clunu in clusterclusterer.clusterOrder:
- clusta = None
- for stats in clusterStats:
- if stats is not None and stats[0] == clunu:
- clusta = stats[1] if stats[2] == 1.0 else "({})".format(stats[1])
- break
- msgdum = RawMessage(messageType=clusta)
- typedClusterDummys.append(msgdum)
-
- dp = DistancesPlotter(specimens, "cluster-clustering-" + plotTitle, False)
- dp.plotManifoldDistances(typedClusterDummys, clusterclusterer.distances, labels)
- dp.writeOrShowFigure()
+ # # Alternative approach to ClusterMerger, discarded.
+ # clusterClusters()
# # overwrite existing variables
# # # # # # # # # # # # # # # # # # # # # # # #
@@ -747,7 +566,7 @@ def epsautoconfeval(epsilon):
# # labels for distance plot
# msgLabelMap = {tuple(msgsegs): clunu for clunu, msgs in messageClusters.items() for msgsegs in msgs}
# labels = numpy.array([msgLabelMap[tuple(seglist)] for seglist in segmentedMessages])
-
+ #
# END # of # check for cluster merge candidates #
# # # # # # # # # # # # # # # # # # # # # # # #
@@ -757,48 +576,45 @@ def epsautoconfeval(epsilon):
-
-
-
-
-
-
-
+ writePerformanceStatistics(
+ specimens, tyl.clusterer, inferenceParams.plotTitle,
+ segmentationTime, dist_calc_segmentsTime,
+ tyl.dist_calc_messagesTime, tyl.cluster_params_autoconfTime, tyl.cluster_messagesTime, tyl.align_messagesTime
+ )
+ filechecker.writeReportMetadata(fromCache.dccachefn if fromCache.isLoaded else None)
# # # # # # # # # # # # # # # # # # # # # # # #
# write alignments to csv
# # # # # # # # # # # # # # # # # # # # # # # #
- reportFolder = "reports"
- pcapName = splitext(pcapbasename)[0]
- fileNameS = "NEMETYL-symbols-" + plotTitle + "-" + pcapName
- csvpath = join(reportFolder, fileNameS + '.csv')
+ # TODO split clusters are internally re-aligned, but NOT merged clusters. Can this lead to an inconsistency?
+ csvpath = join(filechecker.reportFullPath,
+ f"NEMETYL-symbols-{inferenceParams.plotTitle}-{filechecker.pcapstrippedname}.csv")
if not exists(csvpath):
print('Write alignments to {}...'.format(csvpath))
with open(csvpath, 'w') as csvfile:
symbolcsv = csv.writer(csvfile)
- for clunu, clusg in alignedClusters.items():
- symbolcsv.writerow(["# Cluster", clunu, "- Fields -", "- Alignment -"])
+ symbolcsv.writerow(["Cluster", "Type", "frame.time_epoch", "Field", "Alignment"])
+ for clunu, clusg in tyl.alignedClusters.items():
symbolcsv.writerows(
- [groundtruth[comparator.messages[next(seg for seg in msg if seg is not None).message]]]
+ [clunu, # cluster label
+ groundtruth[comparator.messages[next(seg for seg in msg if seg is not None).message]], # message type string from gt
+ next(seg for seg in msg if seg is not None).message.date] # frame.time_epoch
+ [sg.bytes.hex() if sg is not None else '' for sg in msg] for msg in clusg
)
- symbolcsv.writerow(["---"] * 5)
else:
print("Symbols not saved. File {} already exists.".format(csvpath))
if not args.interactive:
IPython.embed()
# # # # # # # # # # # # # # # # # # # # # # # #
- writePerformanceStatistics(
- specimens, clusterer,
- "{} {} {}".format(tokenizer, analysis_method, distance_method),
- segmentationTime, dist_calc_segmentsTime, dist_calc_messagesTime,
- cluster_params_autoconfTime, cluster_messagesTime, align_messagesTime
- )
+
+
if args.interactive:
+ import numpy
from tabulate import tabulate
+
# globals().update(locals())
IPython.embed()
diff --git a/src/netzob_fms.py b/src/netzob_fms.py
index 8315505d..29e7ecd9 100644
--- a/src/netzob_fms.py
+++ b/src/netzob_fms.py
@@ -36,6 +36,19 @@
minThresh = 75
maxThresh = 75
+
+# optimal similarity threshold for some evaluation traces (from -100s):
+optThresh = {
+ "dhcp_SMIA2011101X-filtered_maxdiff-" : 76,
+ "dns_ictf2010_maxdiff-" : 51,
+ "dns_ictf2010-new_maxdiff-" : 50,
+ "nbns_SMIA20111010-one_maxdiff-" : 53,
+ "ntp_SMIA-20111010_maxdiff-" : 66,
+ "smb_SMIA20111010-one-rigid1_maxdiff-" : 53,
+}
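+# Keys are pcap-filename prefixes; a trace matches if its filename contains the key as substring (see below).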
+
+
+
def getNetzobInference(l5msgs: List[AbstractMessage], minEquivalence=45):
"""
Imports the application layer messages from a PCAP and applies Format.clusterByAlignment() to them.
@@ -108,10 +121,10 @@ def findFormatExamples(theshSymbols):
formatsInSymbols[thresh] = dict()
for symb, rest in symbqual.items():
formatsInSymbols[thresh][symb] = list()
- for msg in symb.messages: # the L5Messages
+ for l5msg in symb.messages: # the L5Messages
# only retain unique formats (a list of tuples of primitives can simply be compared)
- if tformats[specimens.messagePool[msg]] not in (fmt for fmt, msg in formatsInSymbols[thresh][symb]):
- formatsInSymbols[thresh][symb].append((tformats[specimens.messagePool[msg]], msg)) # the format for message msg
+ if tformats[specimens.messagePool[l5msg]] not in (fmt for fmt, msg in formatsInSymbols[thresh][symb]):
+ formatsInSymbols[thresh][symb].append((tformats[specimens.messagePool[l5msg]], l5msg)) # the format for message l5msg
def reduceBitsToBytes(formatdescbit):
@@ -147,6 +160,9 @@ def reduceBitsToBytes(formatdescbit):
parser.add_argument('--smax', type=int, help='maximum similarity threshold to iterate. Omit to only infer at the threshold of smin')
parser.add_argument('-p', '--profile', help='profile the netzob run.',
action="store_true")
+ parser.add_argument('-i', '--interactive', help='Show interactive plot instead of writing output to file and '
+ 'open ipython prompt after finishing the analysis.',
+ action="store_true")
parser.add_argument('-l', '--layer', type=int, default=2,
help='Protocol layer relative to IP to consider. Default is 2 layers above IP '
'(typically the payload of a transport protocol).')
@@ -165,11 +181,17 @@ def reduceBitsToBytes(formatdescbit):
failOnUndissectable=False, debug=debug)
print('Loaded and dissected in {:.3f}s'.format(time.time() - swstart))
- print('\nNetzob Inference ...')
+ print(f'\nNetzob Inference of {specimens.pcapFileName}...')
# dict ( similaritythreshold : dict ( symbol : (quality, fieldcount, exactf, nearf, uospecific) ) )
if args.smin:
minThresh = args.smin
maxThresh = args.smax if args.smax else args.smin
+ else:
+ # use optimum for trace if a value is known
+ for pcap, simthr in optThresh.items():
+ if pcap in specimens.pcapFileName:
+ minThresh = maxThresh = simthr
+ break
threshSymbTfmtTime = iterSimilarities(minThresh, maxThresh)
threshSymbTfmt = {t: s for t, (s, r) in threshSymbTfmtTime.items()}
threshTime = {t: r for t, (s, r) in threshSymbTfmtTime.items()}
@@ -212,8 +234,10 @@ def reduceBitsToBytes(formatdescbit):
qualityperformat[df] = list() # FMS per format
qpfSimilarity[df] = list()
for (thresh, msg), metrics in formatmatchmetrics.items(): # per threshold
- qualityperformat[metrics.trueFormat].append(metrics.score)
- qpfSimilarity[metrics.trueFormat].append(thresh)
+ # ignore parsing errors
+ if metrics.score is not None:
+ qualityperformat[metrics.trueFormat].append(metrics.score)
+ qpfSimilarity[metrics.trueFormat].append(thresh)
# TODO biggest/most correct cluster per threshold
# TODO format correctness, consiseness, (coverage) of each symbol
@@ -233,7 +257,7 @@ def reduceBitsToBytes(formatdescbit):
plt.scatter(qpfSimilarity[df], qualityperformat[df], c=xkcdc[i], alpha=0.5, marker=r'.',
label="Format {:d} ".format(i)) # + repr(df))
plt.ticklabel_format(style="plain")
- plt.xlabel("Similarity Theshold")
+ plt.xlabel("Similarity Threshold")
plt.ylabel("Format Match Score")
plt.legend(loc=2)
@@ -249,9 +273,11 @@ def reduceBitsToBytes(formatdescbit):
ParsedMessage.closetshark()
# interactive stuff
- # plt.show()
- print("\nAll truths are easy to understand once they are discovered; the point is to discover them. -- Galileo Galilei\n")
- IPython.embed()
+ if args.interactive:
+ # plt.show()
+ print("\nAll truths are easy to understand once they are discovered; the point is to discover them. "
+ "-- Galileo Galilei\n")
+ IPython.embed()
diff --git a/src/netzob_messagetypes.py b/src/netzob_messagetypes.py
index 9f11c5b5..81686c5c 100644
--- a/src/netzob_messagetypes.py
+++ b/src/netzob_messagetypes.py
@@ -13,16 +13,14 @@
Usenix WOOT 2018.
"""
import argparse
-from os.path import isfile, join
+from os.path import isfile, splitext, basename
from typing import Dict, Tuple, List
from netzob import all as netzob
from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage
-import nemere.utils.evaluationHelpers as eh
-from nemere.inference.segments import HelperSegment
-from nemere.inference.analyzers import NoneAnalysis
from nemere.utils.loader import SpecimenLoader
+from nemere.utils.reportWriter import IndividualClusterReport, CombinatorialClustersReport
from nemere.validation.messageParser import ParsedMessage
from nemere.validation.dissectorMatcher import MessageComparator
@@ -154,17 +152,17 @@ def iterSimilarities(minSimilarity=40, maxSimilarity=60) \
# compare symbols' messages to true message types
groundtruth = {msg: pm.messagetype for msg, pm in comparator.parsedMessages.items()}
- eh.clStatsFile = join(eh.reportFolder, 'messagetype-netzob-statistics.csv')
- eh.ccStatsFile = join(eh.reportFolder, 'messagetype-combined-netzob-statistics.csv')
+ IndividualClusterReport.messagetypeStatsFile = "messagetype-netzob-statistics"
+ CombinatorialClustersReport.messagetypeStatsFile = "messagetype-combined-netzob-statistics"
for thresh, symbMsgs in threshSymbMsgs.items():
- # place each msg in tuple of one dummy segment
- messageClusters = {symname: [[HelperSegment(NoneAnalysis(msg),0,len(msg.data))] for msg in msglist]
- for symname, msglist in symbMsgs.items()}
-
- clusterStats, conciseness = eh.writeIndividualMessageClusteringStaticstics(
- messageClusters, groundtruth, "netzob-thresh={}".format(thresh), comparator)
- eh.writeCollectiveMessageClusteringStaticstics(
- messageClusters, groundtruth, "netzob-thresh={}".format(thresh), comparator)
+ messageClusters = {symname: [specimens.messagePool[msg] for msg in msglist]
+ for symname, msglist in symbMsgs.items()}
+
+ # TODO replace title by ...inferenceParams
+ clusterReport = IndividualClusterReport(groundtruth, splitext(basename(specimens.pcapFileName))[0])
+ clusterReport.write(messageClusters, "netzob-thresh={}".format(thresh))
+ combinReport = CombinatorialClustersReport(groundtruth, splitext(basename(specimens.pcapFileName))[0])
+ combinReport.write(messageClusters, "netzob-thresh={}".format(thresh))
ParsedMessage.closetshark()
diff --git a/src/prep_deduplicate-trace.py b/src/prep_deduplicate-trace.py
index daf021b4..6b55ce3c 100644
--- a/src/prep_deduplicate-trace.py
+++ b/src/prep_deduplicate-trace.py
@@ -7,8 +7,12 @@
import argparse
from os.path import exists,isfile,splitext
from collections import OrderedDict
+from collections.abc import Sequence
import logging # hide warnings of scapy: https://stackoverflow.com/questions/24812604/hide-scapy-warning-message-ipv6
+
+from scapy.layers.dot11 import RadioTap
+
logging.getLogger("scapy.runtime").setLevel(logging.ERROR)
from scapy.layers.inet import IP
from scapy.layers.l2 import Ether
@@ -50,17 +54,19 @@ def dedup(self, packet):
if isinstance(self.TARGETLAYER, int):
targetpacket = packet[self.TARGETLAYER]
else:
- targetpacket = packet[self.TARGETLAYER][2]
+ targetpacket = packet[self.TARGETLAYER[0]][self.TARGETLAYER[1]]
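+ # self.TARGETLAYER is a (base, offset) tuple, e.g., ('IP', 2) selects the layer two above IP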
self.unique_packets[str(targetpacket)] = packet
if len(self.unique_packets) >= PACKET_LIMIT:
return True
except IndexError:
- if isinstance(TARGETLAYER, str):
- layername = TARGETLAYER + ' + 2'
+ if isinstance(self.TARGETLAYER, str):
+ layername = self.TARGETLAYER
+ elif isinstance(self.TARGETLAYER, Sequence):
+ layername = f'{self.TARGETLAYER[0]} + {self.TARGETLAYER[1]}'
else:
- layername = TARGETLAYER
+ layername = self.TARGETLAYER
print('Protocol layer ' + str(layername) + ' not available in the following packet:')
print('\n\n' + repr(packet) + '\n\n')
return False
@@ -71,7 +77,7 @@ def main(filename, outputfile, targetlayer, packetlimit):
# TODO sniff waits indefinitely if the input-pcap file contains less # packets < PACKET_LIMIT; break with ctrl+c
# sniff has the advantage to NOT read the whole file into the memory initially. This saves memory for huge pcaps.
- sniff(offline=filename,stop_filter=dedup.dedup,store=0)
+ sniff(offline=filename, stop_filter=dedup.dedup, store=0)
# get the first packet (we assume all have the same linktype)
eplpkt = next(iter(dedup.unique_packets.values()))
@@ -79,12 +85,15 @@ def main(filename, outputfile, targetlayer, packetlimit):
lt = ParsingConstants.LINKTYPES["ETHERNET"] # 0x1
elif isinstance(eplpkt, IP):
lt = ParsingConstants.LINKTYPES["RAW_IP"] # 0x65
+ elif isinstance(eplpkt, RadioTap):
+ lt = ParsingConstants.LINKTYPES["IEEE802_11_RADIO"] # 0x7f
else:
- raise Exception("Check linktype.")
+ raise Exception("Check linktype ({}).".format(eplpkt.name))
wrpcap(outputfile, dedup.unique_packets.values(), linktype=lt)
print("Deduplication of {:s} of pcap written to {:s}".format(
- str(TARGETLAYER) if not isinstance(TARGETLAYER, str) else TARGETLAYER + ' + 2',
+ targetlayer if isinstance(targetlayer, str) else
+ f"{targetlayer[0]} + {targetlayer[1]}" if isinstance(targetlayer, Sequence) else str(targetlayer),
outfile))
@@ -93,18 +102,20 @@ def main(filename, outputfile, targetlayer, packetlimit):
parser = argparse.ArgumentParser(description=
'Limit number of packets in pcap outfile to fixed number of unique packets.')
parser.add_argument('pcapfilename', help='pcapfilename')
- parser.add_argument('-l', '--layernumber', nargs='?', type= int,
- help='layernumber (default: IP+2)', default='-1')
+ parser.add_argument('-l', '--layer', type=int, default=2,
+ help='Protocol layer relative to IP to consider. Default is 2 layers above IP '
+ '(typically the payload of a transport protocol).')
+ parser.add_argument('-r', '--relativeToIP', default=False, action='store_true')
parser.add_argument('-p', '--packetcount', nargs='?', type= int,
help='packet count (default: {:d})'.format(PACKET_LIMIT), default=PACKET_LIMIT)
args = parser.parse_args()
FILENAME = args.pcapfilename # 'file.pcap'
PACKET_LIMIT = args.packetcount
- if args.layernumber >= 0:
- TARGETLAYER = args.layernumber # use 'IP' as flag for "IP+2"
+ if not args.relativeToIP:
+ TARGETLAYER = (0, args.layer)
else:
- TARGETLAYER = 'IP'
+ TARGETLAYER = ('IP', args.layer)
if not isfile(FILENAME):
print('File not found: ' + FILENAME)
diff --git a/tests/netzob-support.py b/tests/netzob-support.py
new file mode 100644
index 00000000..249a3cd8
--- /dev/null
+++ b/tests/netzob-support.py
@@ -0,0 +1,121 @@
+"""
+Test suite to ensure crucial Netzob functionality to work as expected.
+
+NEMESYS requires that Netzob is at least as recent to have the following commits:
+ * fix hash generation for Symbol + add one for AbstractField (44d899c/df7094a)
+ * fix building of layered messages (57ee01e/9cb7507)
+"""
+import unittest, itertools
+from os import path
+
+import netzob
+from netzob.Import.PCAPImporter.PCAPImporter import PCAPImporter
+from netzob.Model.Vocabulary.Field import Field
+from netzob.Model.Vocabulary.Symbol import Symbol
+
+
+UPGRADE_NOTE = "Upgrade to current version from https://github.com/netzob!"
+
+class TestAbstractField(unittest.TestCase):
+ """
+ Test requirement about hashability of AbstractField
+ """
+ def setUp(self) -> None:
+ """
+ Construct AbstractFields with the same name.
+ Use a subclass implementing AbstractField to instantiate.
+ """
+ self.afs = [Field(name="NEMESYS")]*5
+
+ def test_hash(self):
+ """
+ Check if different objects return different hashes.
+ """
+ for afA, afB in itertools.combinations(self.afs, 2):
+ if afA == afB:
+ continue
+ # print(id(afA), id(afB))
+ assert hash(afA) != hash(afB), "Netzob version without fixed AbstractField hash generation. " + \
+ UPGRADE_NOTE
+
+class TestSymbol(unittest.TestCase):
+ """
+ Test requirement about hashability of Symbol
+ """
+ def setUp(self) -> None:
+ """
+ Construct Symbols with the same name.
+ """
+ self.symbols = [Symbol(fields=[], messages=[], name="NEMESYS")]*5
+
+ def test_hash(self):
+ """
+ Check if different objects return different hashes.
+ """
+ for symA, symB in itertools.combinations(self.symbols, 2):
+ if symA == symB:
+ continue
+ assert hash(symA) != hash(symB), "Netzob version without fixed Symbol hash generation. " + \
+ UPGRADE_NOTE
+
+class TestPCAPImporter(unittest.TestCase):
+ """
+ Test import from PCAP file on different layers. Ensure the correct bytes are considered payload.
+ """
+ # TESTPCAP = "resources/pcaps/test_import_udp.pcap"
+ # """relative to the netzob repository folder 'test'"""
+ TESTPCAP = "resources/test_import_udp_courtesy2NetzobTeam.pcap"
+
+ UPGRADE_NOTE = "Netzob version with faulty PCAP import. "
+
+ def setUp(self) -> None:
+ # modpath = path.dirname(netzob.__file__)
+ # testpath = path.join(modpath, "../../test")
+ # self.pcappath = path.join(testpath, TestPCAPImporter.TESTPCAP)
+ self.pcappath = TestPCAPImporter.TESTPCAP
+
+ def test_layer1(self):
+ """
+ Raw frame (whole frame is "payload")
+ """
+ messages = PCAPImporter.readFile(self.pcappath, importLayer=1).values()
+ assert messages[0].data == b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00' \
+ b'E\x00\x003\xdc\x11@\x00@\x11`\xa6\x7f\x00\x00\x01\x7f\x00\x00\x01' \
+ b'\xe1\xe7\x10\x92\x00\x1f\xfe2' \
+ b'CMDidentify#\x07\x00\x00\x00Roberto', \
+ TestPCAPImporter.UPGRADE_NOTE + UPGRADE_NOTE
+
+ def test_layer2(self):
+ """
+ Layer 2, e.g. parse an Ethernet frame (IP packet is payload)
+ """
+ messages = PCAPImporter.readFile(self.pcappath, importLayer=2).values()
+ assert messages[0].data == b'E\x00\x003\xdc\x11@\x00@\x11`\xa6\x7f\x00\x00\x01\x7f\x00\x00\x01' \
+ b'\xe1\xe7\x10\x92\x00\x1f\xfe2' \
+ b'CMDidentify#\x07\x00\x00\x00Roberto', \
+ TestPCAPImporter.UPGRADE_NOTE + UPGRADE_NOTE
+
+ def test_layer3(self):
+ """
+ Layer 3, e.g. parse an IP packet (UDP datagram is payload)
+ """
+ messages = PCAPImporter.readFile(self.pcappath, importLayer=3).values()
+ assert messages[0].data == b'\xe1\xe7\x10\x92\x00\x1f\xfe2' \
+ b'CMDidentify#\x07\x00\x00\x00Roberto', \
+ TestPCAPImporter.UPGRADE_NOTE + UPGRADE_NOTE
+
+ def test_layer4plus(self):
+ """
+ Layer 4, e.g. parse a UDP datagram (application protocol is payload)
+ Layers above 4 decode the same as layer 4.
+ """
+ messages4 = PCAPImporter.readFile(self.pcappath, importLayer=4).values()
+ messages5 = PCAPImporter.readFile(self.pcappath, importLayer=5).values()
+ assert messages4[0].data == messages5[0].data == b'CMDidentify#\x07\x00\x00\x00Roberto', \
+ TestPCAPImporter.UPGRADE_NOTE + UPGRADE_NOTE
+
+
+
+
+if "__main__" == __name__:
+ unittest.main()
\ No newline at end of file
diff --git a/tests/resources/test_import_udp_courtesy2NetzobTeam.pcap b/tests/resources/test_import_udp_courtesy2NetzobTeam.pcap
new file mode 100644
index 00000000..db5a310d
Binary files /dev/null and b/tests/resources/test_import_udp_courtesy2NetzobTeam.pcap differ