filter-repo: rewrite to not use pyparsing in order to avoid memory madness

pyparsing sucks the whole file into memory and then parses it, which is
really bad in this case since the output from git-fast-export is huge.
I hit disk-swapping madness pretty easily.  So now I just do my own
manual parsing.

Signed-off-by: Elijah Newren <newren@gmail.com>
commit 471e9d8684 (pull/13/head)
parent ae486e85b8
Author: Elijah Newren
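
The core of the change is the new run() loop further down: rather than handing
the entire export to pyparsing, the filter reads one line at a time, dispatches
on the line's first word, and for "data <size>" blocks reads exactly that many
bytes, so memory use is bounded by the largest single blob instead of the whole
stream. A minimal standalone sketch of that technique (blobs only; parse_stream
and parse_blob are illustrative names, not part of this file):

    import io

    def parse_blob(stream):
        # The 'blob' line was just consumed; an optional 'mark :<n>' may follow
        mark = None
        line = stream.readline()
        if line.startswith('mark :'):
            mark = int(line[len('mark :'):])
            line = stream.readline()
        # 'data <size>' declares an exact byte count -- read only that much
        size = int(line[len('data '):])
        return mark, stream.read(size)

    def parse_stream(stream):
        # One line in memory at a time; no whole-file slurp
        line = stream.readline()
        while line:
            if line.startswith('blob'):
                yield parse_blob(stream)
            line = stream.readline()

    example = "blob\nmark :1\ndata 5\nhello\n\n"
    for mark, data in parse_stream(io.StringIO(example)):
        print(mark, repr(data))   # -> 1 'hello'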

@@ -1,58 +1,12 @@
 import os
 import re
 import sys
-from subprocess import Popen, PIPE
-from pyparsing import ParserElement, Literal, Optional, Combine, Word, nums, \
-                      Regex, ZeroOrMore, OneOrMore, CharsNotIn, \
-                      dblQuotedString, \
-                      ParseException, ParseSyntaxException
-from pyparsing import Token, ParseResults
+from subprocess import Popen, PIPE, call
+from email.utils import unquote
 
 __all__ = ["Blob", "Reset", "FileChanges", "Commit", "get_total_commits",
            "FastExportFilter", "FastExportOuput", "FastImportInput"]
 
-class ExactData(Token):
-  """Specialized pyparsing subclass for handling data dumps in git-fast-import
-  exact data format"""
-  def __init__( self ):
-    super(ExactData,self).__init__()
-
-    self.pattern = r"data (\d+)\n"
-    self.re = re.compile(self.pattern)
-    self.reString = self.pattern
-
-    self.name = "ExactData"
-    self.errmsg = "Expected " + self.name
-    self.mayIndexError = False
-    self.mayReturnEmpty = True
-
-  def parseImpl( self, instring, loc, doActions=True ):
-    result = self.re.match(instring,loc)
-    if not result:
-      exc = self.myException
-      exc.loc = loc
-      exc.pstr = instring
-      raise exc
-
-    num = result.group(1)
-    loc = result.end()+int(num)
-    data = instring[result.end():loc]
-    d = result.groupdict()
-
-    ret = ParseResults(['data', num, data])
-    return loc,ret
-
-  def __str__( self ):
-    try:
-      return super(ExactMath,self).__str__()
-    except:
-      pass
-
-    if self.strRepr is None:
-      self.strRepr = "Data:"
-
-    return self.strRepr
-
 class IDs(object):
   def __init__(self):
     self.count = 0
@@ -137,6 +91,8 @@ class FileChanges(GitElement):
       file.write('M %s :%d %s\n' % (self.mode, self.id, self.filename))
     elif self.type == 'D':
       file.write('D %s\n' % self.filename)
+    else:
+      raise SystemExit("Unhandled filechange type: %s" % self.type)
 
 class Commit(GitElement):
   def __init__(self, branch,
@@ -187,7 +143,6 @@ class FastExportFilter(object):
                blob_callback = None, progress_callback = None,
                reset_callback = None, checkpoint_callback = None,
                everything_callback = None):
-    self._setup_parser()
     self.tag_callback = tag_callback
     self.blob_callback = blob_callback
     self.reset_callback = reset_callback
@@ -196,17 +151,79 @@ class FastExportFilter(object):
     self.checkpoint_callback = checkpoint_callback
     self.everything_callback = everything_callback
 
     self.input = None
     self.output = sys.stdout
-
-  def _make_blob(self, t):
-    # Create the Blob object from the parser tokens
-    id = int(t[1][1:])
-    datalen = int(t[3])
-    data = t[4]
-    if datalen != len(data):
-      raise SystemExit('%d != len(%s)' % datalen, data)
+    self.nextline = ''
+
+  def _advance_nextline(self):
+    self.nextline = self.input.readline()
+
+  def _parse_optional_mark(self):
+    mark = None
+    matches = re.match('mark :(\d+)\n$', self.nextline)
+    if matches:
+      mark = int(matches.group(1))
+      self._advance_nextline()
+    return mark
+
+  def _parse_optional_baseref(self, refname):
+    baseref = None
+    matches = re.match('%s :(\d+)\n' % refname, self.nextline)
+    if matches:
+      baseref = ids.translate( int(matches.group(1)) )
+      self._advance_nextline()
+    return baseref
+
+  def _parse_optional_filechange(self):
+    filechange = None
+    if self.nextline.startswith('M '):
+      (mode, idnum, path) = \
+        re.match('M (\d+) :(\d+) (.*)\n$', self.nextline).groups()
+      idnum = int(idnum)
+      if path.startswith('"'):
+        path = unquote(path)
+      filechange = FileChanges('M', path, mode, idnum)
+      self._advance_nextline()
+    elif self.nextline.startswith('D '):
+      path = self.nextline[2:-1]
+      if path.startswith('"'):
+        path = unquote(path)
+      filechange = FileChanges('D', path)
+      self._advance_nextline()
+    return filechange
+
+  def _parse_ref_line(self, refname):
+    matches = re.match('%s (.*)\n$' % refname, self.nextline)
+    if not matches:
+      raise SystemExit("Malformed %s line: '%s'" % (refname, self.nextline))
+    ref = matches.group(1)
+    self._advance_nextline()
+    return ref
+
+  def _parse_user(self, usertype):
+    (name, email, when) = \
+      re.match('%s (.*?) <(.*?)> (.*)\n$' % usertype, self.nextline).groups()
+    self._advance_nextline()
+    return (name, email, when)
+
+  def _parse_data(self):
+    size = int(re.match('data (\d+)\n$', self.nextline).group(1))
+    data = self.input.read(size)
+    self._advance_nextline()
+    return data
+
+  def _parse_blob(self):
+    # Parse the Blob
+    self._advance_nextline()
+    id = self._parse_optional_mark()
+    data = self._parse_data()
+    if self.nextline == '\n':
+      self._advance_nextline()
 
     # Create the blob
     blob = Blob(data)
-    ids.record_rename(id, blob.id)
+    if id:
+      ids.record_rename(id, blob.id)
 
     # Call any user callback to allow them to modify the blob
     if self.blob_callback:
@@ -214,19 +231,17 @@ class FastExportFilter(object):
     if self.everything_callback:
       self.everything_callback('blob', blob)
 
-    # Now print the resulting blob to stdout
+    # Now print the resulting blob
     blob.dump(self.output)
 
-    # We don't need the parser tokens anymore
-    return []
+  def _parse_reset(self):
+    # Parse the Reset
+    ref = self._parse_ref_line('reset')
+    from_ref = self._parse_optional_baseref('from')
+    if self.nextline == '\n':
+      self._advance_nextline()
 
-  def _make_reset(self, t):
-    # Create the Reset object from the parser tokens
-    ref = t[1]
-    from_ref = None
-    if len(t) > 2:
-      old_id = int(t[3][1:])
-      from_ref = ids.translate(old_id)
-
     # Create the reset
     reset = Reset(ref, from_ref)
 
     # Call any user callback to allow them to modify the reset
@@ -235,88 +250,47 @@ class FastExportFilter(object):
     if self.everything_callback:
       self.everything_callback('reset', reset)
 
-    # Now print the resulting reset to stdout
+    # Now print the resulting reset
     reset.dump(self.output)
 
-    # We don't need the parser tokens anymore
-    return []
-
-  def _make_file_changes(self, t):
-    if t[0] == 'M':
-      mode = t[1]
-      old_id = int(t[2][1:])
-      id = ids.translate(old_id)
-      filename = t[3]
-      return FileChanges(t[0], filename, mode, id)
-    elif t[0] == 'D':
-      filename = t[1]
-      return FileChanges(t[0], filename)
-
-  def _make_commit(self, t):
-    #
-    # Create the Commit object from the parser tokens...
-    #
-
-    # Get the branch
-    branch = t[1]
-    loc = 2
-    tlen = len(t)
-
-    # Get the optional mark
-    id = None
-    if t[loc].startswith(':'):
-      id = int(t[loc][1:])
-      loc += 1
-
-    # Get the committer; we'll get back to the author in a minute
-    offset = (t[loc] == 'author') and loc+4 or loc
-    committer_name = t[offset+1]
-    committer_email = t[offset+2]
-    committer_date = t[offset+3]
-
-    # Get the optional author
-    if t[loc] == 'author':
-      author_name = t[loc+1]
-      author_email = t[loc+2]
-      author_date = t[loc+3]
-      loc += 8
-    else:
-      author_name = committer_name
-      author_email = committer_email
-      author_date = committer_date
-      loc += 4
-
-    # Get the commit message
-    messagelen = int(t[loc+1])
-    message = t[loc+2]  # Skip 'data' and len(message)
-    if messagelen != len(message):
-      raise SystemExit("Commit message's length mismatch; %d != len(%s)" % \
-                       messagelen, message)
-    loc += 3
-
-    # Get the commit we're supposed to be based on, if other than HEAD
-    from_commit = None
-    if loc < tlen and t[loc] == 'from':
-      old_id = int(t[loc+1][1:])
-      from_commit = ids.translate(old_id)
-      loc += 2
-
-    # Find out if this is a merge commit, and if so what commits other than
-    # HEAD are involved
-    merge_commits = []
-    while loc < tlen and t[loc] == 'merge':
-      merge_commits.append(ids.translate( int(t[loc+1][1:]) ))
-      loc += 2
+  def _parse_commit(self):
+    # Parse the Commit
+    branch = self._parse_ref_line('commit')
+    id = self._parse_optional_mark()
 
-    # Get file changes
-    file_changes = t[loc:]
+    author_name = None
+    if self.nextline.startswith('author'):
+      (author_name, author_email, author_date) = self._parse_user('author')
+    (committer_name, committer_email, committer_date) = \
+      self._parse_user('committer')
+    if not author_name:
+      (author_name, author_email, author_date) = \
+        (committer_name, committer_email, committer_date)
+    commit_msg = self._parse_data()
+    from_commit = self._parse_optional_baseref('from')
+    merge_commits = []
+    merge_ref = self._parse_optional_baseref('merge')
+    while merge_ref:
+      merge_commits.append(merge_ref)
+      merge_ref = self._parse_optional_baseref('merge')
+    file_changes = []
+    file_change = self._parse_optional_filechange()
+    while file_change:
+      file_changes.append(file_change)
+      file_change = self._parse_optional_filechange()
+    if self.nextline == '\n':
+      self._advance_nextline()
 
     # Okay, now we can finally create the Commit object
     commit = Commit(branch,
                     author_name, author_email, author_date,
                     committer_name, committer_email, committer_date,
-                    message,
+                    commit_msg,
                     file_changes,
                     from_commit,
                     merge_commits)
@@ -332,87 +306,20 @@ class FastExportFilter(object):
 
     # Now print the resulting commit to stdout
     commit.dump(self.output)
 
-    # We don't need the parser tokens anymore
-    return []
-
-  def _setup_parser(self):
-    # Basic setup
-    ParserElement.setDefaultWhitespaceChars('')
-    number = Word(nums)
-    lf = Literal('\n').suppress()
-    sp = Literal(' ').suppress()
-
-    # Common constructs -- data, ref startpoints
-    exact_data = ExactData() + Optional(lf)
-    data = exact_data  # FIXME: Should allow delimited_data too
-    from_ref = Literal('from') + sp + Regex('.*') + lf
-    merge_ref = Literal('merge') + sp + Regex('.*') + lf
-    person_info = sp + Regex('[^<\n]*(?=[ ])') + sp + \
-                  Literal('<').suppress() + Regex('[^<>\n]*') + \
-                  Literal('>').suppress() + sp + \
-                  Regex('.*') + lf
-
-    # Parsing marks
-    idnum = Combine(Literal(':') + number)
-    mark = Literal('mark').suppress() - sp + idnum + lf
-
-    # Parsing blobs
-    file_content = data
-    blob = Literal('blob') + lf + mark + file_content
-    blob.setParseAction(lambda t: self._make_blob(t))
-
-    # Parsing branch resets
-    reset = Literal('reset') + sp + Regex('.*') + lf + \
-            Optional(from_ref) + Optional(lf)
-    reset.setParseAction(lambda t: self._make_reset(t))
-
-    # Parsing file changes
-    mode = Literal('100644') | Literal('644') | Literal('100755') | \
-           Literal('755') | Literal('120000')
-    path_str = CharsNotIn(' \n') | dblQuotedString
-    file_obm = Literal('M') - sp + mode + sp + idnum + sp + path_str + lf
-    file_del = Literal('D') - sp + path_str + lf
-    file_change = file_obm | file_del
-    #file_change = file_clr|file_del|file_rnm|file_cpy|file_obm|file_inm
-    file_change.setParseAction(lambda t: self._make_file_changes(t))
-
-    # Parsing commits
-    author_info = Literal('author') + person_info
-    committer_info = Literal('committer') + person_info
-    commit_msg = data
-    commit = Literal('commit') + sp + Regex('.*') + lf + \
-             Optional(mark) + \
-             Optional(author_info) + \
-             committer_info + \
-             commit_msg + \
-             Optional(from_ref) + \
-             ZeroOrMore(merge_ref) + \
-             ZeroOrMore(file_change) + \
-             Optional(lf)
-    commit.setParseAction(lambda t: self._make_commit(t))
-
-    # Tying it all together
-    cmd = blob | reset | commit
-    self.stream = ZeroOrMore(cmd)
-    self.stream.parseWithTabs()
-
   def run(self, input_file, output_file):
     self.input = input_file
     if output_file:
       self.output = output_file
-    try:
-      results = self.stream.parseFile(input_file)
-    except ParseException, err:
-      print err.line
-      print " "*(err.column-1) + "^"
-      print err
-      raise SystemExit
-    except ParseSyntaxException, err:
-      print err.line
-      print " "*(err.column-1) + "^"
-      print err
-      raise SystemExit
-    input_file.close()
-    output_file.close()
+    self.nextline = input_file.readline()
+    while self.nextline:
+      if self.nextline.startswith('blob'):
+        self._parse_blob()
+      elif self.nextline.startswith('reset'):
+        self._parse_reset()
+      elif self.nextline.startswith('commit'):
+        self._parse_commit()
+      else:
+        raise SystemExit("Could not parse line: '%s'" % self.nextline)
 
 def FastExportOutput(source_repo, extra_args = []):
   return Popen(["git", "fast-export", "--all"] + extra_args,
@@ -422,7 +329,8 @@ def FastExportOutput(source_repo, extra_args = []):
 def FastImportInput(target_repo, extra_args = []):
   if not os.path.isdir(target_repo):
     os.makedirs(target_repo)
-    os.waitpid(Popen(["git", "init"], cwd = target_repo).pid, 0)
+    if call(["git", "init"], cwd = target_repo) != 0:
+      raise SystemExit("git init in %s failed!" % target_repo)
   return Popen(["git", "fast-import"] + extra_args,
                stdin = PIPE,
                stderr = PIPE, # We don't want no stinkin' statistics
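
For reference, the input the hand-rolled parser consumes is git's fast-import
stream format. A tiny fabricated example of the kind of stream that
"git fast-export --all" emits (each "data <size>" count covers exactly the
payload that follows, including its trailing newline):

    blob
    mark :1
    data 12
    Hello world

    commit refs/heads/master
    mark :2
    author Elijah Newren <newren@gmail.com> 1234567890 -0700
    committer Elijah Newren <newren@gmail.com> 1234567890 -0700
    data 17
    Initial revision
    M 100644 :1 greeting.txt

Each top-level keyword (blob, reset, commit) corresponds to one of the
_parse_* branches dispatched from run() above.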
