filter-repo: add GitElement and Blob classes, and a FastExport Parser class

We still only parse a single blob, but this should put the infrastructure
in place for parsing more output from git-fast-export.

Signed-off-by: Elijah Newren <newren@gmail.com>
pull/13/head
Elijah Newren 16 years ago
parent 2b34e5c25d
commit eb4afc4e78

@ -2,63 +2,126 @@
import commands
import re
import sha # bleh...when can I assume python >= 2.5?
import sys
from pyparsing import ParserElement, Literal, Optional, Combine, Word, nums
from pyparsing import Token, ParseResults
class ExactData(Token):
    """Token for matching data dumps in git-fast-import format.

    Matches a header of the form ``data N`` followed by a newline, then
    takes the next N characters of the input verbatim as the payload.
    The parse result holds three tokens: the literal ``'data'``, N as a
    string, and the payload.
    """

    def __init__(self):
        super(ExactData, self).__init__()
        self.pattern = r"data (\d+)\n"
        self.re = re.compile(self.pattern)
        self.reString = self.pattern
        self.name = "ExactData"
        self.errmsg = "Expected " + self.name
        #self.myException.msg = self.errmsg
        self.mayIndexError = False
        self.mayReturnEmpty = True

    def parseImpl(self, instring, loc, doActions=True):
        """Match a data header at ``loc`` and consume its payload.

        Returns a ``(new_loc, ParseResults)`` pair.  Raises the element's
        stored parse exception when no data header starts at ``loc``.
        """
        result = self.re.match(instring, loc)
        if not result:
            exc = self.myException
            exc.loc = loc
            exc.pstr = instring
            raise exc
        num = result.group(1)
        # The payload length comes from the header, not from scanning for a
        # delimiter.  Slicing does not raise on short input, so a truncated
        # payload is returned as-is while loc still reflects the declared
        # length.
        loc = result.end() + int(num)
        data = instring[result.end():loc]
        ret = ParseResults(['data', num, data])
        return loc, ret

    def __str__(self):
        # Prefer the base-class representation; fall back to a fixed label
        # only if that fails.
        try:
            return super(ExactData, self).__str__()
        except Exception:
            pass
        if self.strRepr is None:
            self.strRepr = "Data:"
        return self.strRepr
"""Specialized pyparsing subclass for handling data dumps in git-fast-import
exact data format"""
# Set up the ExactData token: compile the header regex and fill in the
# attributes assigned below (name, error message, parser flags).
def __init__( self ):
super(ExactData,self).__init__()
# Header that precedes every payload: the word "data", a space, a decimal
# length (captured as group 1) and a newline.
self.pattern = r"data (\d+)\n"
self.re = re.compile(self.pattern)
self.reString = self.pattern
# NOTE(review): the next statement runs `git fast-export` and binds a name
# that nothing else in this method reads. It looks like a stray line from
# the other side of the diff rather than part of the constructor -- confirm.
string = commands.getoutput("GIT_DIR=foo/.git git fast-export --all")
self.name = "ExactData"
self.errmsg = "Expected " + self.name
self.mayIndexError = False
self.mayReturnEmpty = True
def parseImpl( self, instring, loc, doActions=True ):
    """Match a data header at ``loc`` and consume its payload.

    Returns a ``(new_loc, ParseResults)`` pair whose tokens are the literal
    ``'data'``, the declared length as a string, and the payload.  Raises
    the element's stored parse exception when no header starts at ``loc``.
    """
    result = self.re.match(instring, loc)
    if not result:
        exc = self.myException
        exc.loc = loc
        exc.pstr = instring
        raise exc
    num = result.group(1)
    # The header's count decides how much input is consumed.  Slicing does
    # not raise on short input, so a truncated payload comes back as-is.
    loc = result.end() + int(num)
    data = instring[result.end():loc]
    ret = ParseResults(['data', num, data])
    return loc, ret
def __str__( self ):
    """Return the base-class representation, or ``"Data:"`` as a fallback."""
    # The superclass lookup must name this class; the fallback below is
    # kept for the case where the base representation fails.
    try:
        return super(ExactData, self).__str__()
    except Exception:
        pass
    if self.strRepr is None:
        self.strRepr = "Data:"
    return self.strRepr
class GitElement(object):
    """Base class for elements that can be written out with dump().

    ``type`` starts as None; the Blob subclass in this file replaces it
    with a string naming the element kind.
    """

    def __init__(self):
        self.type = None

    def dump(self):
        """Write the element out; subclasses must override this.

        Raises:
            SystemExit: always, with a message naming the concrete class.
        """
        # Interpolate the class into the message.  Passing it as a second
        # argument to SystemExit would leave the "%s" unformatted.
        raise SystemExit("Unimplemented function: %s.dump()" % (type(self),))
class Blob(GitElement):
    """A blob element: a numeric mark plus the raw file content."""

    def __init__(self, mark, data):
        GitElement.__init__(self)
        self.type = 'blob'
        self.mark, self.data = mark, data

    def dump(self):
        """Write the blob to stdout as a blob / mark / data record."""
        emit = sys.stdout.write
        emit('blob\n')
        emit('mark :%d\n' % self.mark)
        payload = self.data
        # The length line tells the reader exactly how much payload follows.
        emit('data %d\n%s' % (len(payload), payload))
# Module-level grammar for one `blob` record.
# NOTE(review): these statements repeat what FastExportParser._setup_parser
# builds; presumably this is the pre-refactor copy shown by the diff -- confirm.
#
# Replace pyparsing's default whitespace set with the empty string so that
# spaces and newlines are only matched where the grammar names them.
ParserElement.setDefaultWhitespaceChars('')
number = Word(nums)
# Separators are matched but suppressed, so they do not appear in results.
lf = Literal('\n').suppress()
sp = Literal(' ').suppress()
# A mark reference: a colon followed by digits, combined into one token.
mark_name = Combine(Literal(':') + number)
# The 'mark' keyword is suppressed too; only the mark name is kept.
mark = Literal('mark').suppress() - sp + mark_name + lf
#exact_data = Literal('data') + sp + number + lf
exact_data = ExactData()
file_content = exact_data
blob = Literal('blob') + lf + mark + file_content
# NOTE(review): `string` is not assigned anywhere in this run of statements;
# it has to be defined before this line executes -- verify against the file.
# parseAll = False lets the parse succeed without consuming the whole input.
results = blob.parseString(string, parseAll = False)
class FastExportParser(object):
    """Parser for git-fast-export output with optional per-element callbacks.

    Only the blob grammar is wired up in this class.  The other callbacks
    are stored on the instance but nothing here invokes them.
    """

    def __init__(self,
                 tag_callback=None, commit_callback=None,
                 blob_callback=None, progress_callback=None,
                 reset_callback=None, checkpoint_callback=None,
                 everything_callback=None):
        self._setup_parser()
        self.tag_callback = tag_callback
        self.blob_callback = blob_callback
        self.reset_callback = reset_callback
        self.commit_callback = commit_callback
        self.progress_callback = progress_callback
        self.checkpoint_callback = checkpoint_callback
        self.everything_callback = everything_callback

    def _make_blob(self, t):
        """Parse action: build, filter and emit the Blob for one record.

        ``t`` holds the tokens of a matched blob record, in order: 'blob',
        the mark name (':' plus digits), 'data', the declared length, and
        the payload.

        Returns a list ['blob', mark, length, sha1 hex digest] that replaces
        the original tokens.

        Raises:
            SystemExit: if the declared length differs from the payload size.
        """
        # Create the Blob object from the parser tokens
        mark = int(t[1][1:])   # drop the leading ':'
        datalen = int(t[3])
        data = t[4]
        if datalen != len(data):
            # Both values must go into the format as one tuple; otherwise
            # the formatting itself fails before SystemExit is raised.
            raise SystemExit('%d != len(%s)' % (datalen, data))
        blob = Blob(mark, data)

        # Call any user callback to allow them to modify the blob
        if self.blob_callback:
            self.blob_callback(blob)

        # Now print the resulting blob to stdout
        blob.dump()

        # Replace data with its sha1sum to cut down on memory usage
        # (python parser stores whole resulting parse tree in memory)
        sha1sum = sha.new(blob.data).hexdigest()
        return ['blob', blob.mark, len(blob.data), sha1sum]

    def _setup_parser(self):
        """Build the pyparsing grammar for a blob and store it as self.blob."""
        # Whitespace is significant in this format, so turn off pyparsing's
        # default whitespace skipping before creating any elements.
        ParserElement.setDefaultWhitespaceChars('')

        number = Word(nums)
        lf = Literal('\n').suppress()
        sp = Literal(' ').suppress()

        mark_name = Combine(Literal(':') + number)
        mark = Literal('mark').suppress() - sp + mark_name + lf

        #exact_data = Literal('data') + sp + number + lf
        exact_data = ExactData()
        file_content = exact_data

        self.blob = Literal('blob') + lf + mark + file_content
        self.blob.setParseAction(lambda t: self._make_blob(t))

    def parse(self, string):
        """Parse ``string`` with the blob grammar and return the results.

        The input does not have to be consumed completely
        (``parseAll=False``).
        """
        results = self.blob.parseString(string, parseAll=False)
        return results
# Script entry: build a parser with no callbacks, run `git fast-export --all`
# against the repository at foo/.git, parse the output and print the result.
parser = FastExportParser()
# commands.getoutput returns the command's output as a single string.
string = commands.getoutput("GIT_DIR=foo/.git git fast-export --all")
# FastExportParser.parse only applies the blob grammar and does not require
# the whole input to match.
results = parser.parse(string)
print results

Loading…
Cancel
Save