diff --git a/pdf-parser.py b/pdf-parser.py index 687cc44..5aa3dfd 100644 --- a/pdf-parser.py +++ b/pdf-parser.py @@ -2,10 +2,10 @@ __description__ = 'pdf-parser, use it to parse a PDF document' __author__ = 'Didier Stevens' -__version__ = '0.6.7' -__date__ = '2016/12/17' +__version__ = '0.6.8' +__date__ = '2017/10/29' __minimum_python_version__ = (2, 5, 1) -__maximum_python_version__ = (3, 4, 3) +__maximum_python_version__ = (3, 6, 3) """ Source code put in public domain by Didier Stevens, no Copyright @@ -58,6 +58,9 @@ 2016/07/27: V0.6.5 bugfix whitespace 0x00 0x0C after stream 0x0D 0x0A reported by @mr_me 2016/11/20: V0.6.6 added workaround zlib errors FlateDecode 2016/12/17: V0.6.7 added option -k + 2017/01/07: V0.6.8 changed cPDFParseDictionary to handle strings () with % character + 2017/10/28: fixed bug + 2017/10/29: added # support for option -y Todo: - handle printf todo @@ -146,7 +149,9 @@ def Obj2Str(content): class cPDFDocument: def __init__(self, file): self.file = file - if file.lower().startswith('http://') or file.lower().startswith('https://'): + if type(file) != str: + self.infile = file + elif file.lower().startswith('http://') or file.lower().startswith('https://'): try: if sys.hexversion >= 0x020601F0: self.infile = urllib23.urlopen(file, timeout=5) @@ -275,6 +280,14 @@ def TokenIgnoreWhiteSpace(self): token = self.Token() return token + def Tokens(self): + tokens = [] + token = self.Token() + while token != None: + tokens.append(token) + token = self.Token() + return tokens + def unget(self, byte): self.ungetted.append(byte) @@ -643,7 +656,7 @@ def __init__(self, content, nocanonicalizedoutput): dataTrimmed = TrimLWhiteSpace(TrimRWhiteSpace(self.content)) if dataTrimmed == []: self.parsed = None - elif self.isOpenDictionary(dataTrimmed[0]) and self.isCloseDictionary(dataTrimmed[-1]): + elif self.isOpenDictionary(dataTrimmed[0]) and (self.isCloseDictionary(dataTrimmed[-1]) or self.couldBeCloseDictionary(dataTrimmed[-1])): self.parsed = self.ParseDictionary(dataTrimmed)[0] else: self.parsed = None @@ -654,6 +667,9 @@ def isOpenDictionary(self, token): def isCloseDictionary(self, token): return token[0] == CHAR_DELIMITER and token[1] == '>>' + def couldBeCloseDictionary(self, token): + return token[0] == CHAR_DELIMITER and token[1].rstrip().endswith('>>') + def ParseDictionary(self, tokens): state = 0 # start dictionary = [] @@ -694,7 +710,11 @@ def ParseDictionary(self, tokens): elif value == [] and tokens[0][1] == '(': value.append(tokens[0][1]) elif value != [] and value[0] == '(' and tokens[0][1] != ')': - value.append(tokens[0][1]) + if tokens[0][1][0] == '%': + tokens = [tokens[0]] + cPDFTokenizer(StringIO(tokens[0][1][1:])).Tokens() + tokens[1:] + value.append('%') + else: + value.append(tokens[0][1]) elif value != [] and value[0] == '(' and tokens[0][1] == ')': value.append(tokens[0][1]) dictionary.append((key, value)) @@ -749,7 +769,7 @@ def GetNestedSub(self, dictionary, select): for key, value in dictionary: if key == select: return self.PrettyPrintSubElement('', [select, value]) - if type(value) == type([]) and type(value[0]) == type((None,)): + if type(value) == type([]) and len(value) > 0 and type(value[0]) == type((None,)): result = self.GetNestedSub(value, select) if result !=None: return self.PrettyPrintSubElement('', [select, result]) @@ -1085,17 +1105,30 @@ def ProcessAt(argument): else: return [argument] -def YARACompile(fileordirname): - dFilepaths = {} - if os.path.isdir(fileordirname): - for root, dirs, files in os.walk(fileordirname): - for file in files: - filename = os.path.join(root, file) - dFilepaths[filename] = filename +def YARACompile(ruledata): + if ruledata.startswith('#'): + if ruledata.startswith('#h#'): + rule = binascii.a2b_hex(ruledata[3:]) + elif ruledata.startswith('#b#'): + rule = binascii.a2b_base64(ruledata[3:]) + elif ruledata.startswith('#s#'): + rule = 'rule string {strings: $a = "%s" ascii wide nocase condition: $a}' % ruledata[3:] + elif ruledata.startswith('#q#'): + rule = ruledata[3:].replace("'", '"') + else: + rule = ruledata[1:] + return yara.compile(source=rule) else: - for filename in ProcessAt(fileordirname): - dFilepaths[filename] = filename - return yara.compile(filepaths=dFilepaths) + dFilepaths = {} + if os.path.isdir(ruledata): + for root, dirs, files in os.walk(ruledata): + for file in files: + filename = os.path.join(root, file) + dFilepaths[filename] = filename + else: + for filename in ProcessAt(ruledata): + dFilepaths[filename] = filename + return yara.compile(filepaths=dFilepaths) def AddDecoder(cClass): global decoders @@ -1447,7 +1480,8 @@ def Main(): print('StartXref: %s' % cntStartXref) print('Indirect object: %s' % cntIndirectObject) names = dicObjectTypes.keys() - for key in sorted(names): + names.sort() + for key in names: print(' %s %d: %s' % (key, len(dicObjectTypes[key]), ', '.join(map(lambda x: '%d' % x, dicObjectTypes[key])))) if options.generate or options.generateembedded != 0: diff --git a/pdfid.py b/pdfid.py index d911d57..2cba9d5 100644 --- a/pdfid.py +++ b/pdfid.py @@ -2,8 +2,8 @@ __description__ = 'Tool to test a PDF file' __author__ = 'Didier Stevens' -__version__ = '0.2.1' -__date__ = '2014/10/18' +__version__ = '0.2.4' +__date__ = '2018/01/29' """ @@ -45,6 +45,14 @@ 2014/09/30: added CSV header 2014/10/16: V0.2.1: added output when plugin & file not pdf 2014/10/18: some fixes for Python 3 + 2015/08/12: V0.2.2: added option pluginoptions + 2015/08/13: added plugin Instructions method + 2016/04/12: added option literal + 2017/10/29: added pdfid.ini support + 2017/11/05: V0.2.3: added option -n + 2018/01/03: V0.2.4: bugfix entropy calculation for PDFs without streams; sample 28cb208d976466b295ee879d2d233c8a https://twitter.com/DubinRan/status/947783629123416069 + 2018/01/15: bugfix ConfigParser privately reported + 2018/01/29: bugfix oPDFEOF.cntCharsAfterLastEOF when no %%EOF Todo: - update XML example (entropy, EOF) @@ -64,14 +72,14 @@ import zipfile import collections import glob -try: - import urllib2 - urllib23 = urllib2 -except: - import urllib.request - urllib23 = urllib.request - -plugins = [] +if sys.version_info[0] >= 3: + import urllib.request as urllib23 +else: + import urllib2 as urllib23 +if sys.version_info[0] >= 3: + import configparser as ConfigParser +else: + import ConfigParser #Convert 2 Bytes If Python 3 def C2BIP3(string): @@ -236,7 +244,10 @@ def calc(self): allCount = sum(self.allBucket) streamCount = sum(self.streamBucket) nonStreamCount = sum(self.nonStreamBucket) - return (allCount, sum(map(lambda x: fEntropy(x, allCount), self.allBucket)), streamCount, sum(map(lambda x: fEntropy(x, streamCount), self.streamBucket)), nonStreamCount, sum(map(lambda x: fEntropy(x, nonStreamCount), self.nonStreamBucket))) + if streamCount == 0: + return (allCount, sum(map(lambda x: fEntropy(x, allCount), self.allBucket)), streamCount, None, nonStreamCount, sum(map(lambda x: fEntropy(x, nonStreamCount), self.nonStreamBucket))) + else: + return (allCount, sum(map(lambda x: fEntropy(x, allCount), self.allBucket)), streamCount, sum(map(lambda x: fEntropy(x, streamCount), self.streamBucket)), nonStreamCount, sum(map(lambda x: fEntropy(x, nonStreamCount), self.nonStreamBucket))) class cPDFEOF: def __init__(self): @@ -348,6 +359,18 @@ def XMLAddAttribute(xmlDoc, name, value=None): xmlDoc.documentElement.setAttributeNode(att) if value != None: att.nodeValue = value + return att + +def ParseINIFile(): + oConfigParser = ConfigParser.ConfigParser(allow_no_value=True) + oConfigParser.optionxform = str + oConfigParser.read(os.path.join(os.path.dirname(sys.argv[0]), 'pdfid.ini')) + keywords = [] + if oConfigParser.has_section('keywords'): + for key, value in oConfigParser.items('keywords'): + if not key in keywords: + keywords.append(key) + return keywords def PDFiD(file, allNames=False, extraData=False, disarm=False, force=False): """Example of XML output: @@ -379,7 +402,7 @@ def PDFiD(file, allNames=False, extraData=False, disarm=False, force=False): hexcode = False lastName = '' insideStream = False - keywords = ('obj', + keywords = ['obj', 'endobj', 'stream', 'endstream', @@ -399,9 +422,12 @@ def PDFiD(file, allNames=False, extraData=False, disarm=False, force=False): '/Launch', '/EmbeddedFile', '/XFA', - ) + ] words = {} dates = [] + for extrakeyword in ParseINIFile(): + if not extrakeyword in keywords: + keywords.append(extrakeyword) for keyword in keywords: words[keyword] = [0, 0] slash = '' @@ -534,7 +560,10 @@ def PDFiD(file, allNames=False, extraData=False, disarm=False, force=False): (countAll, entropyAll , countStream, entropyStream, countNonStream, entropyNonStream) = oEntropy.calc() attEntropyAll.nodeValue = '%f' % entropyAll attCountAll.nodeValue = '%d' % countAll - attEntropyStream.nodeValue = '%f' % entropyStream + if entropyStream == None: + attEntropyStream.nodeValue = 'N/A ' + else: + attEntropyStream.nodeValue = '%f' % entropyStream attCountStream.nodeValue = '%d' % countStream attEntropyNonStream.nodeValue = '%f' % entropyNonStream attCountNonStream.nodeValue = '%d' % countNonStream @@ -551,7 +580,10 @@ def PDFiD(file, allNames=False, extraData=False, disarm=False, force=False): xmlDoc.documentElement.setAttributeNode(attCountCharsAfterLastEOF) if oPDFEOF != None: attCountEOF.nodeValue = '%d' % oPDFEOF.cntEOFs - attCountCharsAfterLastEOF.nodeValue = '%d' % oPDFEOF.cntCharsAfterLastEOF + if oPDFEOF.cntEOFs > 0: + attCountCharsAfterLastEOF.nodeValue = '%d' % oPDFEOF.cntCharsAfterLastEOF + else: + attCountCharsAfterLastEOF.nodeValue = '' else: attCountEOF.nodeValue = '' attCountCharsAfterLastEOF.nodeValue = '' @@ -610,7 +642,7 @@ def PDFiD(file, allNames=False, extraData=False, disarm=False, force=False): eleDate.setAttributeNode(att) return xmlDoc -def PDFiD2String(xmlDoc, force): +def PDFiD2String(xmlDoc, nozero, force): result = 'PDFiD %s %s\n' % (xmlDoc.documentElement.getAttribute('Version'), xmlDoc.documentElement.getAttribute('Filename')) if xmlDoc.documentElement.getAttribute('ErrorOccured') == 'True': return result + '***Error occured***\n%s\n' % xmlDoc.documentElement.getAttribute('ErrorMessage') @@ -618,10 +650,11 @@ def PDFiD2String(xmlDoc, force): return result + ' Not a PDF document\n' result += ' PDF Header: %s\n' % xmlDoc.documentElement.getAttribute('Header') for node in xmlDoc.documentElement.getElementsByTagName('Keywords')[0].childNodes: - result += ' %-16s %7d' % (node.getAttribute('Name'), int(node.getAttribute('Count'))) - if int(node.getAttribute('HexcodeCount')) > 0: - result += '(%d)' % int(node.getAttribute('HexcodeCount')) - result += '\n' + if not nozero or nozero and int(node.getAttribute('Count')) > 0: + result += ' %-16s %7d' % (node.getAttribute('Name'), int(node.getAttribute('Count'))) + if int(node.getAttribute('HexcodeCount')) > 0: + result += '(%d)' % int(node.getAttribute('HexcodeCount')) + result += '\n' if xmlDoc.documentElement.getAttribute('CountEOF') != '': result += ' %-16s %7d\n' % ('%%EOF', int(xmlDoc.documentElement.getAttribute('CountEOF'))) if xmlDoc.documentElement.getAttribute('CountCharsAfterLastEOF') != '': @@ -705,7 +738,7 @@ def MakeCSVLine(fields, separator=';', quote='"'): def ProcessFile(filename, options, plugins): xmlDoc = PDFiD(filename, options.all, options.extra, options.disarm, options.force) if plugins == [] and options.select == '': - Print(PDFiD2String(xmlDoc, options.force), options) + Print(PDFiD2String(xmlDoc, options.nozero, options.force), options) return oPDFiD = cPDFiD(xmlDoc, options.force) @@ -723,12 +756,12 @@ def ProcessFile(filename, options, plugins): if options.csv: Print(filename, options) else: - Print(PDFiD2String(xmlDoc, options.force), options) + Print(PDFiD2String(xmlDoc, options.nozero, options.force), options) else: for cPlugin in plugins: if not cPlugin.onlyValidPDF or not oPDFiD.errorOccured and oPDFiD.isPDF: try: - oPlugin = cPlugin(oPDFiD) + oPlugin = cPlugin(oPDFiD, options.pluginoptions) except Exception as e: Print('Error instantiating plugin: %s' % cPlugin.name, options) if options.verbose: @@ -748,8 +781,12 @@ def ProcessFile(filename, options, plugins): Print(MakeCSVLine((('%s', filename), ('%s', cPlugin.name), ('%.02f', score))), options) else: if score >= options.minimumscore: - Print(PDFiD2String(xmlDoc, options.force), options) - Print('%s score: %.02f' % (cPlugin.name, score), options) + Print(PDFiD2String(xmlDoc, options.nozero, options.force), options) + Print('%s score: %.02f' % (cPlugin.name, score), options) + try: + Print('%s instructions: %s' % (cPlugin.name, oPlugin.Instructions(score)), options) + except AttributeError: + pass else: if options.csv: if oPDFiD.errorOccured: @@ -757,7 +794,7 @@ def ProcessFile(filename, options, plugins): if not oPDFiD.isPDF: Print(MakeCSVLine((('%s', filename), ('%s', cPlugin.name), ('%s', 'Not a PDF document'))), options) else: - Print(PDFiD2String(xmlDoc, options.force), options) + Print(PDFiD2String(xmlDoc, options.nozero, options.force), options) def Scan(directory, options, plugins): @@ -909,7 +946,10 @@ def Main(): oParser.add_option('-m', '--minimumscore', type=float, default=0.0, help='minimum score for plugin results output') oParser.add_option('-v', '--verbose', action='store_true', default=False, help='verbose (will also raise catched exceptions)') oParser.add_option('-S', '--select', type=str, default='', help='selection expression') + oParser.add_option('-n', '--nozero', action='store_true', default=False, help='supress output for counts equal to zero') oParser.add_option('-o', '--output', type=str, default='', help='output to log file') + oParser.add_option('--pluginoptions', type=str, default='', help='options for the plugin') + oParser.add_option('-l', '--literal', action='store_true', default=False, help='take filenames literally, no wildcards') (options, args) = oParser.parse_args() if len(args) == 0: @@ -920,6 +960,8 @@ def Main(): print('Option scan not supported with stdin') options.scan = False filenames = [''] + elif options.literal: + filenames = args else: try: filenames = ExpandFilenameArguments(args) diff --git a/plugin_embeddedfile.py b/plugin_embeddedfile.py index 6980249..2d68bcb 100644 --- a/plugin_embeddedfile.py +++ b/plugin_embeddedfile.py @@ -1,7 +1,6 @@ #!/usr/bin/env python #2014/10/13 -from .pdfid import cPluginParent, AddPlugin class cPDFiDEmbeddedFile(cPluginParent): # onlyValidPDF = True diff --git a/plugin_nameobfuscation.py b/plugin_nameobfuscation.py index 7a40fcd..e116da2 100644 --- a/plugin_nameobfuscation.py +++ b/plugin_nameobfuscation.py @@ -2,7 +2,6 @@ #2013/11/04 #2013/11/08 -from .pdfid import cPluginParent, AddPlugin class cPDFiDNameObfuscation(cPluginParent): # onlyValidPDF = True diff --git a/plugin_triage.py b/plugin_triage.py index 477c5e5..b619bae 100644 --- a/plugin_triage.py +++ b/plugin_triage.py @@ -1,23 +1,51 @@ #!/usr/bin/env python #2014/09/30 -from .pdfid import cPluginParent, AddPlugin +#2015/08/12 added options; changed scoring: /ObjStm 0.75; obj/endobj or stream/endstream discrepancy: 0.50 +#2015/08/13 added instructions +#2017/10/29 added /URI class cPDFiDTriage(cPluginParent): -# onlyValidPDF = True + onlyValidPDF = False name = 'Triage plugin' - def __init__(self, oPDFiD): + def __init__(self, oPDFiD, options): + self.options = options self.oPDFiD = oPDFiD def Score(self): - for keyword in ('/ObjStm', '/JS', '/JavaScript', '/AA', '/OpenAction', '/AcroForm', '/JBIG2Decode', '/RichMedia', '/Launch', '/EmbeddedFile', '/XFA', '/Colors > 2^24'): + for keyword in ('/JS', '/JavaScript', '/AA', '/OpenAction', '/AcroForm', '/JBIG2Decode', '/RichMedia', '/Launch', '/EmbeddedFile', '/XFA', '/Colors > 2^24'): if keyword in self.oPDFiD.keywords and self.oPDFiD.keywords[keyword].count > 0: return 1.0 - if self.oPDFiD.keywords['obj'].count != self.oPDFiD.keywords['endobj'].count: - return 1.0 - if self.oPDFiD.keywords['stream'].count != self.oPDFiD.keywords['endstream'].count: - return 1.0 + if self.options != '--io': + for keyword in ('/ObjStm', ): + if keyword in self.oPDFiD.keywords and self.oPDFiD.keywords[keyword].count > 0: + return 0.75 + for keyword in ('/URI', ): + if keyword in self.oPDFiD.keywords and self.oPDFiD.keywords[keyword].count > 0: + return 0.6 + if self.oPDFiD.keywords['obj'].count != self.oPDFiD.keywords['endobj'].count: + return 0.5 + if self.oPDFiD.keywords['stream'].count != self.oPDFiD.keywords['endstream'].count: + return 0.5 return 0.0 + def Instructions(self, score): + if score == 1.0: + return 'Sample is likely malicious and requires further analysis' + + if score == 0.75: + return '/ObjStm detected, analyze sample with pdfid-objstm.bat' + + if score == 0.5: + return 'Sample is likely not malicious but requires further analysis' + + if score == 0.6: + return 'Sample is likely not malicious but could contain phishing or payload URL' + + if score == 0.0: + return 'Sample is likely not malicious, unless you suspect this is used in a targeted/sophisticated attack' + + return '' + AddPlugin(cPDFiDTriage)