Skip to content
This repository was archived by the owner on Jun 7, 2023. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 52 additions & 18 deletions pdf-parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

__description__ = 'pdf-parser, use it to parse a PDF document'
__author__ = 'Didier Stevens'
__version__ = '0.6.7'
__date__ = '2016/12/17'
__version__ = '0.6.8'
__date__ = '2017/10/29'
__minimum_python_version__ = (2, 5, 1)
__maximum_python_version__ = (3, 4, 3)
__maximum_python_version__ = (3, 6, 3)

"""
Source code put in public domain by Didier Stevens, no Copyright
Expand Down Expand Up @@ -58,6 +58,9 @@
2016/07/27: V0.6.5 bugfix whitespace 0x00 0x0C after stream 0x0D 0x0A reported by @mr_me
2016/11/20: V0.6.6 added workaround zlib errors FlateDecode
2016/12/17: V0.6.7 added option -k
2017/01/07: V0.6.8 changed cPDFParseDictionary to handle strings () with % character
2017/10/28: fixed bug
2017/10/29: added # support for option -y

Todo:
- handle printf todo
Expand Down Expand Up @@ -146,7 +149,9 @@ def Obj2Str(content):
class cPDFDocument:
def __init__(self, file):
self.file = file
if file.lower().startswith('http://') or file.lower().startswith('https://'):
if type(file) != str:
self.infile = file
elif file.lower().startswith('http://') or file.lower().startswith('https://'):
try:
if sys.hexversion >= 0x020601F0:
self.infile = urllib23.urlopen(file, timeout=5)
Expand Down Expand Up @@ -275,6 +280,14 @@ def TokenIgnoreWhiteSpace(self):
token = self.Token()
return token

def Tokens(self):
    """Drain the tokenizer and return all remaining tokens as a list.

    Calls self.Token() repeatedly until it returns None, which is treated
    as the end-of-input sentinel. The sentinel itself is not included in
    the returned list; an exhausted tokenizer yields an empty list.
    """
    tokens = []
    token = self.Token()
    # Identity test against the None sentinel: never dispatches to a
    # token's own __eq__/__ne__, so no token value can end the loop early.
    while token is not None:
        tokens.append(token)
        token = self.Token()
    return tokens

# Append `byte` to the self.ungetted list.
# NOTE(review): presumably the byte-reading code consults self.ungetted
# before the underlying stream so the byte is seen again - confirm against
# the reader method, which is not visible in this hunk.
def unget(self, byte):
self.ungetted.append(byte)

Expand Down Expand Up @@ -643,7 +656,7 @@ def __init__(self, content, nocanonicalizedoutput):
dataTrimmed = TrimLWhiteSpace(TrimRWhiteSpace(self.content))
if dataTrimmed == []:
self.parsed = None
elif self.isOpenDictionary(dataTrimmed[0]) and self.isCloseDictionary(dataTrimmed[-1]):
elif self.isOpenDictionary(dataTrimmed[0]) and (self.isCloseDictionary(dataTrimmed[-1]) or self.couldBeCloseDictionary(dataTrimmed[-1])):
self.parsed = self.ParseDictionary(dataTrimmed)[0]
else:
self.parsed = None
Expand All @@ -654,6 +667,9 @@ def isOpenDictionary(self, token):
def isCloseDictionary(self, token):
    """Return True when `token` is a delimiter token whose text is exactly '>>'.

    `token` is indexed as (type, text): element 0 is compared with
    CHAR_DELIMITER and element 1 with the literal '>>'.
    """
    # Check the token type first so the text is only inspected for delimiters.
    if not (token[0] == CHAR_DELIMITER):
        return False
    return token[1] == '>>'

def couldBeCloseDictionary(self, token):
    """Return True when `token` is a delimiter token whose text ends in '>>'.

    Looser than an exact '>>' match: trailing whitespace in the token text
    is stripped first, and any text that then ends with '>>' is accepted.
    """
    # Check the token type first so the text is only inspected for delimiters.
    if not (token[0] == CHAR_DELIMITER):
        return False
    stripped = token[1].rstrip()
    return stripped.endswith('>>')

def ParseDictionary(self, tokens):
state = 0 # start
dictionary = []
Expand Down Expand Up @@ -694,7 +710,11 @@ def ParseDictionary(self, tokens):
elif value == [] and tokens[0][1] == '(':
value.append(tokens[0][1])
elif value != [] and value[0] == '(' and tokens[0][1] != ')':
value.append(tokens[0][1])
if tokens[0][1][0] == '%':
tokens = [tokens[0]] + cPDFTokenizer(StringIO(tokens[0][1][1:])).Tokens() + tokens[1:]
value.append('%')
else:
value.append(tokens[0][1])
elif value != [] and value[0] == '(' and tokens[0][1] == ')':
value.append(tokens[0][1])
dictionary.append((key, value))
Expand Down Expand Up @@ -749,7 +769,7 @@ def GetNestedSub(self, dictionary, select):
for key, value in dictionary:
if key == select:
return self.PrettyPrintSubElement('', [select, value])
if type(value) == type([]) and type(value[0]) == type((None,)):
if type(value) == type([]) and len(value) > 0 and type(value[0]) == type((None,)):
result = self.GetNestedSub(value, select)
if result !=None:
return self.PrettyPrintSubElement('', [select, result])
Expand Down Expand Up @@ -1085,17 +1105,30 @@ def ProcessAt(argument):
else:
return [argument]

def YARACompile(ruledata):
    """Compile YARA rules from inline rule data or from rule files.

    `ruledata` is interpreted by its prefix:
      #h#<hex>     rule source given as hexadecimal text
      #b#<base64>  rule source given as base64 text
      #s#<string>  generates a rule matching <string> (ascii wide nocase)
      #q#<rule>    rule source in which single quotes stand for double quotes
      #<rule>      rule source taken literally (everything after the '#')
    Anything else is treated as a location of rule files: a directory is
    walked recursively and every file in it is used; otherwise the value is
    expanded with ProcessAt() into a list of rule filenames.

    Returns the object produced by yara.compile().
    """
    if ruledata.startswith('#'):
        if ruledata.startswith('#h#'):
            rule = binascii.a2b_hex(ruledata[3:])
        elif ruledata.startswith('#b#'):
            rule = binascii.a2b_base64(ruledata[3:])
        elif ruledata.startswith('#s#'):
            rule = 'rule string {strings: $a = "%s" ascii wide nocase condition: $a}' % ruledata[3:]
        elif ruledata.startswith('#q#'):
            rule = ruledata[3:].replace("'", '"')
        else:
            rule = ruledata[1:]
        return yara.compile(source=rule)

    # Not inline data: build the {namespace: path} mapping yara.compile expects,
    # using each file's path as its own namespace.
    dFilepaths = {}
    if os.path.isdir(ruledata):
        for root, _dirs, names in os.walk(ruledata):
            for name in names:
                filename = os.path.join(root, name)
                dFilepaths[filename] = filename
    else:
        for filename in ProcessAt(ruledata):
            dFilepaths[filename] = filename
    return yara.compile(filepaths=dFilepaths)

def AddDecoder(cClass):
global decoders
Expand Down Expand Up @@ -1447,7 +1480,8 @@ def Main():
print('StartXref: %s' % cntStartXref)
print('Indirect object: %s' % cntIndirectObject)
names = dicObjectTypes.keys()
for key in sorted(names):
names.sort()
for key in names:
print(' %s %d: %s' % (key, len(dicObjectTypes[key]), ', '.join(map(lambda x: '%d' % x, dicObjectTypes[key]))))

if options.generate or options.generateembedded != 0:
Expand Down
94 changes: 68 additions & 26 deletions pdfid.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

__description__ = 'Tool to test a PDF file'
__author__ = 'Didier Stevens'
__version__ = '0.2.1'
__date__ = '2014/10/18'
__version__ = '0.2.4'
__date__ = '2018/01/29'

"""

Expand Down Expand Up @@ -45,6 +45,14 @@
2014/09/30: added CSV header
2014/10/16: V0.2.1: added output when plugin & file not pdf
2014/10/18: some fixes for Python 3
2015/08/12: V0.2.2: added option pluginoptions
2015/08/13: added plugin Instructions method
2016/04/12: added option literal
2017/10/29: added pdfid.ini support
2017/11/05: V0.2.3: added option -n
2018/01/03: V0.2.4: bugfix entropy calculation for PDFs without streams; sample 28cb208d976466b295ee879d2d233c8a https://twitter.com/DubinRan/status/947783629123416069
2018/01/15: bugfix ConfigParser privately reported
2018/01/29: bugfix oPDFEOF.cntCharsAfterLastEOF when no %%EOF

Todo:
- update XML example (entropy, EOF)
Expand All @@ -64,14 +72,14 @@
import zipfile
import collections
import glob
try:
import urllib2
urllib23 = urllib2
except:
import urllib.request
urllib23 = urllib.request

plugins = []
if sys.version_info[0] >= 3:
import urllib.request as urllib23
else:
import urllib2 as urllib23
if sys.version_info[0] >= 3:
import configparser as ConfigParser
else:
import ConfigParser

#Convert 2 Bytes If Python 3
def C2BIP3(string):
Expand Down Expand Up @@ -236,7 +244,10 @@ def calc(self):
allCount = sum(self.allBucket)
streamCount = sum(self.streamBucket)
nonStreamCount = sum(self.nonStreamBucket)
return (allCount, sum(map(lambda x: fEntropy(x, allCount), self.allBucket)), streamCount, sum(map(lambda x: fEntropy(x, streamCount), self.streamBucket)), nonStreamCount, sum(map(lambda x: fEntropy(x, nonStreamCount), self.nonStreamBucket)))
if streamCount == 0:
return (allCount, sum(map(lambda x: fEntropy(x, allCount), self.allBucket)), streamCount, None, nonStreamCount, sum(map(lambda x: fEntropy(x, nonStreamCount), self.nonStreamBucket)))
else:
return (allCount, sum(map(lambda x: fEntropy(x, allCount), self.allBucket)), streamCount, sum(map(lambda x: fEntropy(x, streamCount), self.streamBucket)), nonStreamCount, sum(map(lambda x: fEntropy(x, nonStreamCount), self.nonStreamBucket)))

class cPDFEOF:
def __init__(self):
Expand Down Expand Up @@ -348,6 +359,18 @@ def XMLAddAttribute(xmlDoc, name, value=None):
xmlDoc.documentElement.setAttributeNode(att)
if value != None:
att.nodeValue = value
return att

def ParseINIFile():
    """Return the extra keywords listed in the [keywords] section of pdfid.ini.

    The file is looked up in the directory of the running script
    (sys.argv[0]). Option names are kept case-sensitive and may appear
    without a value. A missing file or a missing [keywords] section yields
    an empty list.
    """
    oConfigParser = ConfigParser.ConfigParser(allow_no_value=True)
    # Keep option names exactly as written; the default transform lowercases them.
    oConfigParser.optionxform = str
    oConfigParser.read(os.path.join(os.path.dirname(sys.argv[0]), 'pdfid.ini'))
    keywords = []
    if oConfigParser.has_section('keywords'):
        # raw=True: only the option names are used, so skip value
        # interpolation, which would raise on a value containing '%'.
        for key, _value in oConfigParser.items('keywords', raw=True):
            if key not in keywords:
                keywords.append(key)
    return keywords

def PDFiD(file, allNames=False, extraData=False, disarm=False, force=False):
"""Example of XML output:
Expand Down Expand Up @@ -379,7 +402,7 @@ def PDFiD(file, allNames=False, extraData=False, disarm=False, force=False):
hexcode = False
lastName = ''
insideStream = False
keywords = ('obj',
keywords = ['obj',
'endobj',
'stream',
'endstream',
Expand All @@ -399,9 +422,12 @@ def PDFiD(file, allNames=False, extraData=False, disarm=False, force=False):
'/Launch',
'/EmbeddedFile',
'/XFA',
)
]
words = {}
dates = []
for extrakeyword in ParseINIFile():
if not extrakeyword in keywords:
keywords.append(extrakeyword)
for keyword in keywords:
words[keyword] = [0, 0]
slash = ''
Expand Down Expand Up @@ -534,7 +560,10 @@ def PDFiD(file, allNames=False, extraData=False, disarm=False, force=False):
(countAll, entropyAll , countStream, entropyStream, countNonStream, entropyNonStream) = oEntropy.calc()
attEntropyAll.nodeValue = '%f' % entropyAll
attCountAll.nodeValue = '%d' % countAll
attEntropyStream.nodeValue = '%f' % entropyStream
if entropyStream == None:
attEntropyStream.nodeValue = 'N/A '
else:
attEntropyStream.nodeValue = '%f' % entropyStream
attCountStream.nodeValue = '%d' % countStream
attEntropyNonStream.nodeValue = '%f' % entropyNonStream
attCountNonStream.nodeValue = '%d' % countNonStream
Expand All @@ -551,7 +580,10 @@ def PDFiD(file, allNames=False, extraData=False, disarm=False, force=False):
xmlDoc.documentElement.setAttributeNode(attCountCharsAfterLastEOF)
if oPDFEOF != None:
attCountEOF.nodeValue = '%d' % oPDFEOF.cntEOFs
attCountCharsAfterLastEOF.nodeValue = '%d' % oPDFEOF.cntCharsAfterLastEOF
if oPDFEOF.cntEOFs > 0:
attCountCharsAfterLastEOF.nodeValue = '%d' % oPDFEOF.cntCharsAfterLastEOF
else:
attCountCharsAfterLastEOF.nodeValue = ''
else:
attCountEOF.nodeValue = ''
attCountCharsAfterLastEOF.nodeValue = ''
Expand Down Expand Up @@ -610,18 +642,19 @@ def PDFiD(file, allNames=False, extraData=False, disarm=False, force=False):
eleDate.setAttributeNode(att)
return xmlDoc

def PDFiD2String(xmlDoc, force):
def PDFiD2String(xmlDoc, nozero, force):
result = 'PDFiD %s %s\n' % (xmlDoc.documentElement.getAttribute('Version'), xmlDoc.documentElement.getAttribute('Filename'))
if xmlDoc.documentElement.getAttribute('ErrorOccured') == 'True':
return result + '***Error occured***\n%s\n' % xmlDoc.documentElement.getAttribute('ErrorMessage')
if not force and xmlDoc.documentElement.getAttribute('IsPDF') == 'False':
return result + ' Not a PDF document\n'
result += ' PDF Header: %s\n' % xmlDoc.documentElement.getAttribute('Header')
for node in xmlDoc.documentElement.getElementsByTagName('Keywords')[0].childNodes:
result += ' %-16s %7d' % (node.getAttribute('Name'), int(node.getAttribute('Count')))
if int(node.getAttribute('HexcodeCount')) > 0:
result += '(%d)' % int(node.getAttribute('HexcodeCount'))
result += '\n'
if not nozero or nozero and int(node.getAttribute('Count')) > 0:
result += ' %-16s %7d' % (node.getAttribute('Name'), int(node.getAttribute('Count')))
if int(node.getAttribute('HexcodeCount')) > 0:
result += '(%d)' % int(node.getAttribute('HexcodeCount'))
result += '\n'
if xmlDoc.documentElement.getAttribute('CountEOF') != '':
result += ' %-16s %7d\n' % ('%%EOF', int(xmlDoc.documentElement.getAttribute('CountEOF')))
if xmlDoc.documentElement.getAttribute('CountCharsAfterLastEOF') != '':
Expand Down Expand Up @@ -705,7 +738,7 @@ def MakeCSVLine(fields, separator=';', quote='"'):
def ProcessFile(filename, options, plugins):
xmlDoc = PDFiD(filename, options.all, options.extra, options.disarm, options.force)
if plugins == [] and options.select == '':
Print(PDFiD2String(xmlDoc, options.force), options)
Print(PDFiD2String(xmlDoc, options.nozero, options.force), options)
return

oPDFiD = cPDFiD(xmlDoc, options.force)
Expand All @@ -723,12 +756,12 @@ def ProcessFile(filename, options, plugins):
if options.csv:
Print(filename, options)
else:
Print(PDFiD2String(xmlDoc, options.force), options)
Print(PDFiD2String(xmlDoc, options.nozero, options.force), options)
else:
for cPlugin in plugins:
if not cPlugin.onlyValidPDF or not oPDFiD.errorOccured and oPDFiD.isPDF:
try:
oPlugin = cPlugin(oPDFiD)
oPlugin = cPlugin(oPDFiD, options.pluginoptions)
except Exception as e:
Print('Error instantiating plugin: %s' % cPlugin.name, options)
if options.verbose:
Expand All @@ -748,16 +781,20 @@ def ProcessFile(filename, options, plugins):
Print(MakeCSVLine((('%s', filename), ('%s', cPlugin.name), ('%.02f', score))), options)
else:
if score >= options.minimumscore:
Print(PDFiD2String(xmlDoc, options.force), options)
Print('%s score: %.02f' % (cPlugin.name, score), options)
Print(PDFiD2String(xmlDoc, options.nozero, options.force), options)
Print('%s score: %.02f' % (cPlugin.name, score), options)
try:
Print('%s instructions: %s' % (cPlugin.name, oPlugin.Instructions(score)), options)
except AttributeError:
pass
else:
if options.csv:
if oPDFiD.errorOccured:
Print(MakeCSVLine((('%s', filename), ('%s', cPlugin.name), ('%s', 'Error occured'))), options)
if not oPDFiD.isPDF:
Print(MakeCSVLine((('%s', filename), ('%s', cPlugin.name), ('%s', 'Not a PDF document'))), options)
else:
Print(PDFiD2String(xmlDoc, options.force), options)
Print(PDFiD2String(xmlDoc, options.nozero, options.force), options)


def Scan(directory, options, plugins):
Expand Down Expand Up @@ -909,7 +946,10 @@ def Main():
oParser.add_option('-m', '--minimumscore', type=float, default=0.0, help='minimum score for plugin results output')
oParser.add_option('-v', '--verbose', action='store_true', default=False, help='verbose (will also raise catched exceptions)')
oParser.add_option('-S', '--select', type=str, default='', help='selection expression')
oParser.add_option('-n', '--nozero', action='store_true', default=False, help='supress output for counts equal to zero')
oParser.add_option('-o', '--output', type=str, default='', help='output to log file')
oParser.add_option('--pluginoptions', type=str, default='', help='options for the plugin')
oParser.add_option('-l', '--literal', action='store_true', default=False, help='take filenames literally, no wildcards')
(options, args) = oParser.parse_args()

if len(args) == 0:
Expand All @@ -920,6 +960,8 @@ def Main():
print('Option scan not supported with stdin')
options.scan = False
filenames = ['']
elif options.literal:
filenames = args
else:
try:
filenames = ExpandFilenameArguments(args)
Expand Down
1 change: 0 additions & 1 deletion plugin_embeddedfile.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/env python

#2014/10/13
from .pdfid import cPluginParent, AddPlugin

class cPDFiDEmbeddedFile(cPluginParent):
# onlyValidPDF = True
Expand Down
1 change: 0 additions & 1 deletion plugin_nameobfuscation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

#2013/11/04
#2013/11/08
from .pdfid import cPluginParent, AddPlugin

class cPDFiDNameObfuscation(cPluginParent):
# onlyValidPDF = True
Expand Down
Loading