You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
git-filter-repo/contrib/filter-repo-demos/convert-svnexternals

603 lines
21 KiB
Python

#!/usr/bin/env python3
"""
This is a program that will insert Git submodules according to SVN externals
definitions (svn:externals properties) from the original Subversion repository
throughout the history.
Information about the externals is obtained from the ".gitsvnextmodules" file
created during SVN-to-Git conversion by SubGit (https://subgit.com/). Its
config option "translate.externals=true" had to be used therefore.
Actual modifications:
- Insert gitlinks (mode 160000) into the tree.
- Add .gitmodules file with relevant sections.
- Remove sections converted to submodules from .gitsvnextmodules file
and delete it if empty.
.gitsvnextmodules example:
[submodule "somedir/extdir"]
path = somedir/extdir
owner = somedir
url = https://svn.example.com/somesvnrepo/trunk
revision = 1234
branch = /
fetch = :refs/remotes/git-svn
remote = svn
type = dir
Resulting addition in "somedir" tree (cat-file pretty-print format):
160000 commit 1234123412341234123412341234123412341234 extdir
Resulting .gitmodules entry:
[submodule "somedir/extdir"]
path = somedir/extdir
url = https://git.example.com/somegitrepo.git
SVN-to-Git mapping file:
Can be created from SubGit's "refs/svn/map".
One line per mapping in following format:
<svn url> TAB <svn rev> TAB <git url> TAB <git commit> TAB <state>
- Leading '#' can be used for comments.
- <svn url> must not contain a trailing slash.
- <state> has to be "commit" to be usable, but can be "missing" if <git commit>
does not exist in the repository anymore. Adopted from git-cat-file output.
Example:
https://svn.example.com/somesvnrepo/trunk 1234 https://git.example.com/somegitrepo.git 1234123412341234123412341234123412341234 commit
Features:
- Repeatedly added/removed externals will be handled properly.
- Externals replaced by directly added files and vice versa will be handled
properly.
Caveats:
- This script must NOT be run repeatedly. A second invocation would lead to a
different result in case the externals could only be converted partially.
- Inconsistent SVN repositories (with failing checkout) not handled, i.e.
- normal directory and external with the same path
- external path not existing for the given revision
- No attention was paid to non-ASCII and special characters in gitlink paths,
might cause problems.
- There is no error handling for mandatory options missing in .gitsvnextmodules
file. The script would crash in case of such buggy files, but that shouldn't
happen in practice.
TODO:
- Add external files directly.
- Alternatively add external directories directly instead of using a submodule.
"""
"""
Please see the
***** API BACKWARD COMPATIBILITY CAVEAT *****
near the top of git-filter-repo.
"""
import argparse
import os
import sys
import shutil
import subprocess
import configparser
from urllib.parse import urlsplit
try:
import git_filter_repo as fr
except ImportError:
raise SystemExit("Error: Couldn't find git_filter_repo.py. Did you forget to make a symlink to git-filter-repo named git_filter_repo.py or did you forget to put the latter in your PYTHONPATH?")
# Root URL of the SVN repository. Stays empty unless --svn-root-url is given;
# the main script section stores the passed value without trailing slash(es).
svn_root_url = ""
# SVN-to-Git mappings; replaced by the result of read_mappings() when the
# conversion (not the analysis) is run.
svn_git_mappings = []
def parse_args():
    """
    Build the command line interface, parse sys.argv and validate the result.

    Returns the argparse namespace. Exits with an error message if
    --svn-git-mapfiles is combined with --analyze, or if it is missing while
    --analyze is not given.
    """
    cli = argparse.ArgumentParser(
        description="Add Git submodules according to svn:externals from .gitsvnextmodules. "
                    "As preparation for this conversion process, an analysis can be performed.")

    # General options
    cli.add_argument(
        '--force', '-f', action='store_true',
        help="Rewrite repository history even if the current repo does not "
             "look like a fresh clone.")
    cli.add_argument(
        '--refs', nargs='+',
        help="Limit history rewriting to the specified refs. Option is directly "
             "forwarded to git-filter-repo, see there for details and caveats. "
             "Use for debugging purposes only!")
    cli.add_argument(
        '--svn-root-url',
        help="Root URL of the corresponding SVN repository, "
             "needed for conversion of relative to absolute external URLs.")

    # Options only meaningful for the analysis run
    analysis_group = cli.add_argument_group(title="Analysis")
    analysis_group.add_argument(
        '--analyze', action='store_true',
        help="Analyze repository history and create auxiliary files for conversion process.")
    analysis_group.add_argument(
        '--report-dir', type=os.fsencode,
        help="Directory to write report, defaults to GIT_DIR/filter-repo/svnexternals, "
             "refuses to run if exists, --force delete existing dir first.")

    # Options only meaningful for the conversion run
    conversion_group = cli.add_argument_group(title="Conversion")
    conversion_group.add_argument(
        '--svn-git-mapfiles', type=os.fsencode, nargs='+', metavar='MAPFILE',
        help="Files with SVN-to-Git revision mappings for SVN externals conversion.")

    options = cli.parse_args()

    # Exactly one of --analyze and --svn-git-mapfiles has to be present
    has_mapfiles = bool(options.svn_git_mapfiles)
    if options.analyze:
        if has_mapfiles:
            raise SystemExit("Error: --svn-git-mapfiles makes no sense with --analyze.")
    elif not has_mapfiles:
        raise SystemExit("Error: --svn-git-mapfiles is required for the conversion process.")

    return options
def read_mappings(mapfiles):
    """
    Read files with SVN-to-Git mappings and return a list of mappings from it.

    Each non-blank line not starting with '#' must consist of five
    TAB-separated fields:
      <svn url> TAB <svn rev> TAB <git url> TAB <git commit> TAB <state>

    Returns a list of dicts with the keys 'svn_url', 'svn_rev' (int),
    'git_url', 'git_commit' and 'state', in file order.

    Exits with an error message naming the file and line number if a line has
    too few fields or a non-numeric revision.
    """
    mappings = []
    for mapfile in mapfiles:
        with open(mapfile, "rb") as f:
            for lineno, line in enumerate(f, start=1):
                line = line.rstrip(b'\r\n')

                # Skip blank and comment lines
                if not line or line.startswith(b'#'):
                    continue

                # Convert to string for use with configparser later and split
                # into at most five fields (extra TABs stay in <state>)
                fields = line.decode().split('\t', 4)
                try:
                    svn_url, svn_rev, git_url, git_commit, state = fields
                    svn_rev = int(svn_rev)
                except ValueError:
                    # Too few fields or revision is not an integer
                    raise SystemExit(
                        "Error: Malformed SVN-to-Git mapping in \"%s\", line %d "
                        "(expected: <svn url> TAB <svn rev> TAB <git url> TAB "
                        "<git commit> TAB <state>)."
                        % (os.fsdecode(mapfile), lineno)) from None

                mappings.append({'svn_url': svn_url,
                                 'svn_rev': svn_rev,
                                 'git_url': git_url,
                                 'git_commit': git_commit,
                                 'state': state})
    return mappings
# Handle of the "git cat-file --batch" process used by parse_config() to read
# blob contents; started in the main script section below.
cat_file_process = None
def parse_config(blob_id):
    """
    Create a configparser object for a .gitsvnextmodules/.gitmodules file from
    its blob ID.

    The blob contents are requested from the module-level cat_file_process.
    If blob_id is None, an empty parser is returned and the process is not
    contacted.

    Value interpolation is disabled: SVN URLs often contain percent-encoded
    characters (e.g. "%20"), which the default interpolation of configparser
    rejects as invalid syntax when a value is read or set.

    Exits with an error message if the cat-file answer is not the expected
    "<hash> <type> <size>" header (e.g. "<hash> missing").
    """
    parsed_config = configparser.ConfigParser(interpolation=None)
    if blob_id is None:
        return parsed_config

    # Request the blob from the batch process
    cat_file_process.stdin.write(blob_id + b'\n')
    cat_file_process.stdin.flush()

    # Header line: <hash> SP <type> SP <size>
    header = cat_file_process.stdout.readline().split()
    if len(header) != 3:
        raise SystemExit("Error: Could not read blob %s from the repository."
                         % blob_id.decode(errors='backslashreplace'))
    objsize = int(header[2])

    # The contents are followed by one newline which has to be consumed too
    contents_plus_newline = cat_file_process.stdout.read(objsize + 1)

    # Parse it
    parsed_config.read_string(contents_plus_newline.decode())
    return parsed_config
def create_blob(parsed_config):
    """
    Serialize a configparser object (.gitsvnextmodules/.gitmodules content) in
    Git config style and wrap the encoded text in a filter-repo blob object.

    Every section becomes a "[name]" line followed by one TAB-indented
    "key = value" line per option.
    """
    pieces = []
    for section in parsed_config.sections():
        pieces.append("".join(("[", section, "]\n")))
        values = parsed_config[section]
        pieces.extend("".join(("\t", key, " = ", values[key], "\n"))
                      for key in parsed_config.options(section))
    text = "".join(pieces)
    return fr.Blob(text.encode())
def get_git_url(svn_url):
    """
    Look up the Git URL belonging to an SVN URL.

    Returns the 'git_url' of the first entry in svn_git_mappings whose
    'svn_url' equals the given one, or None if there is no such entry.
    """
    candidates = (mapping['git_url']
                  for mapping in svn_git_mappings
                  if mapping['svn_url'] == svn_url)
    return next(candidates, None)
def get_git_commit_hash(svn_url, svn_rev):
    """
    Look up the Git commit hash belonging to an SVN URL+revision.

    Not only an exact revision match is accepted: the mapping with the highest
    revision that is not above the requested one is used. This is needed when
    the external's revision was set to that of the root URL instead of to that
    of the specific subdirectory (e.g. trunk). TortoiseSVN behaves so when
    setting the external to HEAD.

    Returns None if no mapping qualifies or if the chosen mapping does not have
    the state "commit".
    """
    # Revisions have to be positive and must not exceed the requested one
    eligible = [mapping for mapping in svn_git_mappings
                if mapping['svn_url'] == svn_url
                and 0 < mapping['svn_rev'] <= svn_rev]

    # max() keeps the first of several entries with the same revision
    best = max(eligible, key=lambda mapping: mapping['svn_rev'], default=None)

    if best is None or best['state'] != "commit":
        return None
    return best['git_commit']
def get_absolute_svn_url(svnext_url, svn_root_url):
    """
    Convert a relative svn:externals URL to an absolute one.

    Returns a tuple (success, url):
    - Unsupported formats ("../" and "^/../" relative URLs): (False, url) with
      the URL unchanged.
    - No root URL given or URL absolute already: (True, url) with the URL
      unchanged.
    - "^/", "//" and "/" relative URLs: (True, absolute url), built from the
      root URL, its scheme, or its scheme and host respectively.

    In all cases, even if returned "unchanged", trailing slashes are removed.
    """
    # Remove trailing slash(es)
    svnext_url = svnext_url.rstrip("/")
    svn_root_url = svn_root_url.rstrip("/")

    # Unsupported relative formats. The bare forms ".." and "^/.." are what is
    # left of "../" and "^/../" after the trailing slash removal above.
    if (svnext_url in ("..", "^/..")
            or svnext_url.startswith(("../", "^/../"))):
        return (False, svnext_url)

    # Without a root URL nothing can be converted
    if not svn_root_url:
        return (True, svnext_url)

    # Relative to the repository root; a bare "^" is what is left of "^/"
    if svnext_url == "^" or svnext_url.startswith("^/"):
        return (True, svn_root_url + svnext_url[1:])

    svn_root_parsed = urlsplit(svn_root_url)

    # Relative to the scheme of the root URL
    if svnext_url.startswith("//"):
        return (True, svn_root_parsed.scheme + ":" + svnext_url)

    # Relative to the server of the root URL
    if svnext_url.startswith("/"):
        return (True, svn_root_parsed.scheme + "://" + svn_root_parsed.netloc + svnext_url)

    # Absolute already
    return (True, svnext_url)
def parse_revision_value(value):
    """
    Convert the value of key 'revision' from a .gitsvnextmodules file to an
    integer.

    SubGit abbreviates some revisions with a unit suffix, i.e. 1k, 2k, 3k etc.
    instead of 1024, 2048, 3072 etc., likewise 1m, 2m, ..., 1g, ...
    Values without such a suffix are converted as plain integers.
    """
    multipliers = {"k": 1024, "m": 1024**2, "g": 1024**3}
    factor = multipliers.get(value[-1])
    if factor is None:
        # No known suffix, plain number
        return int(value)
    return int(value[:-1]) * factor
def add_submodule_tree_entry(commit, parsed_config, section):
    """
    Append a gitlink (mode 160000) for one SVN external to the file changes of
    a Git commit.

    The external is described by the given section of a parsed
    .gitsvnextmodules file. Returns True if the gitlink was added, False if
    the external was skipped.
    """
    external = parsed_config[section]

    # Only directory externals can become submodules, not type=file
    if external['type'] != 'dir':
        return False

    # Unsupported URL formats cannot be mapped
    supported, svn_url = get_absolute_svn_url(external['url'], svn_root_url)
    if not supported:
        return False

    # TODO: without a revision it has to be guessed according to commit
    # timestamp, skip for now
    if not parsed_config.has_option(section, 'revision'):
        return False
    svn_rev = parse_revision_value(external['revision'])

    # Map SVN url+revision to a Git commit, skip if missing or unusable
    mapped_hash = get_git_commit_hash(svn_url, svn_rev)
    if mapped_hash is None:
        return False

    encoded_hash = mapped_hash.encode()
    gitlink_path = external['path'].encode()

    # Add gitlink to tree
    gitlink = fr.FileChange(b'M', gitlink_path, encoded_hash, b'160000')
    commit.file_changes.append(gitlink)
    return True
def get_commit_map_path():
    """
    Return the path (bytes) of the "filter-repo/commit-map" file inside the Git
    directory determined for the current working directory.
    """
    return os.path.join(fr.GitUtils.determine_git_dir(b'.'),
                        b'filter-repo',
                        b'commit-map')
def parse_commit_map(commit_map_file):
    """
    Read a commit-map file into a dictionary mapping the first column (bytes)
    to the second column (bytes).

    Blank lines are ignored. The "old"/"new" header in the first line is
    stored like any other line.
    """
    with open(commit_map_file, "rb") as stream:
        stripped = (raw.rstrip(b'\r\n') for raw in stream)
        # Every non-blank line has to consist of exactly two columns
        columns = (entry.split() for entry in stripped if entry)
        return {old: new for old, new in columns}
def merge_commit_maps(old_commit_map, new_commit_map):
    """
    Chain two commit-maps so that intermediate commits are omitted.

    Each value of the old map is replaced by what the new map maps it to;
    values unknown to the new map are kept. Returns the merged dictionary,
    which has the keys of the old map.
    """
    return {original: new_commit_map.get(intermediate, intermediate)
            for original, intermediate in old_commit_map.items()}
def write_commit_map(commit_map, commit_map_file):
    """
    Write a commit-map dictionary to a file, one "old new" line per entry with
    the first column padded to a width of 40.
    """
    with open(commit_map_file, 'wb') as out:
        out.writelines(b'%-40s %s\n' % pair for pair in commit_map.items())
def create_report_dir(args):
    """
    Create the directory for the analysis report and return its path.

    The directory is args.report_dir if given, otherwise
    GIT_DIR/filter-repo/svnexternals (the filter-repo directory is created as
    necessary).

    If the report directory exists already, it is removed recursively first
    when args.force is set; otherwise an error is printed and the script exits
    with status 1.
    """
    if args.report_dir:
        reportdir = args.report_dir
    else:
        git_dir = fr.GitUtils.determine_git_dir(b'.')

        # Create the report directory as necessary
        results_tmp_dir = os.path.join(git_dir, b'filter-repo')
        if not os.path.isdir(results_tmp_dir):
            os.mkdir(results_tmp_dir)
        reportdir = os.path.join(results_tmp_dir, b'svnexternals')

    if os.path.isdir(reportdir):
        if args.force:
            # Terminate the line so that following output starts on a new one
            sys.stdout.write("Warning: Removing recursively: \"%s\"\n" % fr.decode(reportdir))
            shutil.rmtree(reportdir)
        else:
            sys.stdout.write("Error: dir already exists (use --force to delete): \"%s\"\n" % fr.decode(reportdir))
            sys.exit(1)

    os.mkdir(reportdir)
    return reportdir
# Results of the analysis run: lists of svn:externals URLs, separately for
# directory and file externals, each in original and in absolute form.
analysis = {key: [] for key in ('dir_ext_orig',
                                'dir_ext_abs',
                                'file_ext_orig',
                                'file_ext_abs')}
def write_analysis(reportdir):
    """
    Sort the collected URL lists and write each of them to its own text file
    (one URL per line) in the report directory.

    Progress is reported on stdout.
    """
    reports = (('dir_ext_orig', b"dir-externals-original.txt"),
               ('dir_ext_abs', b"dir-externals-absolute.txt"),
               ('file_ext_orig', b"file-externals-original.txt"),
               ('file_ext_abs', b"file-externals-absolute.txt"))

    # Prepare: sort all lists in place
    for key, _ in reports:
        analysis[key].sort()

    sys.stdout.write("Writing reports to %s..." % fr.decode(reportdir))
    sys.stdout.flush()

    for key, filename in reports:
        with open(os.path.join(reportdir, filename), 'wb') as out:
            out.writelines(("%s\n" % url).encode() for url in analysis[key])

    sys.stdout.write("done.\n")
def analyze_externals(commit, metadata):
    """
    Collect the svn:externals URLs of a Git commit in the analysis lists.

    Used as filter-repo commit callback. Only added/modified
    .gitsvnextmodules files are evaluated. Every URL is recorded once, both in
    its original form and in the form returned by get_absolute_svn_url().
    """
    for change in commit.file_changes:
        if change.filename != b'.gitsvnextmodules' or change.type != b'M':
            continue

        externals = parse_config(change.blob_id)
        for name in externals.sections():
            external = externals[name]
            original_url = external['url']
            # The returned URL is recorded even if the conversion was not
            # successful
            _, absolute_url = get_absolute_svn_url(original_url, svn_root_url)

            # Everything which is not a directory external counts as file
            prefix = 'dir' if external['type'] == 'dir' else 'file'
            for key, url in ((prefix + '_ext_orig', original_url),
                             (prefix + '_ext_abs', absolute_url)):
                if url not in analysis[key]:
                    analysis[key].append(url)
def insert_submodules(commit, metadata):
    """
    Insert submodules for a Git commit.

    Used as filter-repo commit callback.

    Since .gitsvnextmodules just contains the svn:externals state for the given
    commit, we cannot derive specific changes from that file.
    So we can only add/modify the gitlinks according to .gitsvnextmodules
    (without knowing whether adding a new or modifying an existing or even
    "modifying" an unchanged submodule, but none of that really matters).
    We do not have information about deleted externals, those will be handled
    in a separate filter run afterwards.

    The .gitmodules file however will already be correct in this function
    because we don't need to know about specific changes to add, modify or
    delete it.

    Exits with an error message if a commit hash but no Git URL is found in the
    mapping for an external.
    """
    # Iterate over a snapshot because gitlink and .gitmodules changes are
    # appended to commit.file_changes below.
    for change in list(commit.file_changes):
        if change.filename != b'.gitsvnextmodules' or change.type not in (b'M', b'D'):
            continue

        gitsvnextmodules = parse_config(change.blob_id)
        # No value interpolation: paths and URLs may contain '%' (e.g. "%20"),
        # which the default interpolation rejects when a value is set.
        gitmodules = configparser.ConfigParser(interpolation=None)

        # Add gitlinks to the tree and prepare .gitmodules file content
        for sec in gitsvnextmodules.sections():
            if not add_submodule_tree_entry(commit, gitsvnextmodules, sec):
                continue

            # Gitlink added
            # -> Add this entry to .gitmodules as well
            path = gitsvnextmodules[sec]['path']

            # submodule.<name>.url
            _, svn_url = get_absolute_svn_url(gitsvnextmodules[sec]['url'], svn_root_url)
            git_url = get_git_url(svn_url)
            if git_url is None:
                # Abort, but this will not happen in practice, caught in
                # add_submodule_tree_entry() via get_git_commit_hash() already.
                raise SystemExit("Error: No Git URL found in mapping although a commit hash could be found.")

            # Create the section name string manually, do not rely on
            # .gitsvnextmodules to always use the proper section name.
            sec_name = 'submodule "' + path + '"'
            gitmodules[sec_name] = {'path': path, 'url': git_url}

        # Write blob and adapt tree for .gitmodules
        if gitmodules.sections():
            # Create a blob object from the content and add it to the tree.
            blob = create_blob(gitmodules)
            filter.insert(blob)
            commit.file_changes.append(fr.FileChange(b'M', b'.gitmodules', blob.id, b'100644'))
        else:
            # Delete the file, even if a "git rm" of all submodules keeps it empty.
            commit.file_changes.append(fr.FileChange(b'D', b'.gitmodules'))
def delete_submodules(commit, metadata):
    """
    Delete submodules from a Git commit.

    Used as filter-repo commit callback.

    All submodules (inserted in the previous filter run) without an entry in
    .gitsvnextmodules are deleted; these were real deletions of externals,
    which couldn't be detected before. Submodules with an entry are reinserted
    and, if that succeeds, their section is removed from .gitsvnextmodules.

    Only the tree entries have to be removed because the .gitmodules file is
    already in correct state from previous filter run.
    """
    for change in commit.file_changes:
        if change.filename != b'.gitsvnextmodules' or change.type not in (b'M', b'D'):
            continue

        externals = parse_config(change.blob_id)

        # Search for all submodules in the tree of the commit
        listing = subprocess.check_output(
            ['git', 'ls-tree', '-d', '-r', '-z', commit.original_id])
        for record in listing.split(b'\x00'):
            if not record:
                continue

            # Record format: <mode> SP <type> SP <id> TAB <path>
            attributes, tree_path = record.split(b'\t', 1)
            mode, objtype, _objid = attributes.split(b' ')
            if mode != b'160000' or objtype != b'commit':
                continue

            # Submodule found
            # -> Delete it if there is no corresponding entry in
            #    .gitsvnextmodules, keep/reinsert it otherwise
            for name in externals.sections():
                if externals[name]['path'].encode() == tree_path:
                    # Reinsert it, might have been deleted in previous commits
                    if add_submodule_tree_entry(commit, externals, name):
                        # The external has been converted, so its config
                        # section is not needed anymore
                        externals.remove_section(name)
                    break
            else:
                # No entry found, delete it
                commit.file_changes.append(fr.FileChange(b'D', tree_path))

        # Rewrite .gitsvnextmodules to contain the unhandled externals only,
        # delete it if empty (all externals converted).
        if externals.sections():
            # Create a blob object from the content and replace the original one.
            blob = create_blob(externals)
            filter.insert(blob)
            change.blob_id = blob.id
        elif change.type == b'M':
            # File became empty, delete it
            commit.file_changes.append(fr.FileChange(b'D', b'.gitsvnextmodules'))
            break  # avoid endless for loop
        # Otherwise the file was empty already, delete command already present
        # in stream
# Main script: run either the analysis or the two-pass conversion.
my_args = parse_args()

# Use passed URL without trailing slash(es)
if my_args.svn_root_url:
    svn_root_url = my_args.svn_root_url.rstrip("/")

# Arguments forwarded to filter-repo
extra_args = []
if my_args.force:
    extra_args = ['--force']
if my_args.refs:
    extra_args += ['--refs'] + my_args.refs

# Helper process used by parse_config() to read blob contents
cat_file_process = subprocess.Popen(['git', 'cat-file', '--batch'],
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE)
try:
    if my_args.analyze:
        # Analysis
        reportdir = create_report_dir(my_args)

        fr_args = fr.FilteringOptions.parse_args(['--dry-run']
                                                 + extra_args)
        filter = fr.RepoFilter(fr_args, commit_callback=analyze_externals)
        filter.run()

        write_analysis(reportdir)
    else:
        # Conversion
        svn_git_mappings = read_mappings(my_args.svn_git_mapfiles)

        # There are no references to commit hashes in commit messages because
        # this script runs on a Git repository converted from a Subversion
        # repository.
        fr_args = fr.FilteringOptions.parse_args(['--preserve-commit-hashes',
                                                  '--preserve-commit-encoding',
                                                  '--replace-refs', 'update-no-add']
                                                 + extra_args)

        # First run: insert gitlinks and .gitmodules
        filter = fr.RepoFilter(fr_args, commit_callback=insert_submodules)
        filter.run()

        # Store commit-map after first run
        first_commit_map = parse_commit_map(get_commit_map_path())

        # Second run: delete submodules of removed externals
        filter = fr.RepoFilter(fr_args, commit_callback=delete_submodules)
        filter.run()

        # Update commit-map after second run, based on original IDs
        second_commit_map = parse_commit_map(get_commit_map_path())
        merged_commit_map = merge_commit_maps(first_commit_map, second_commit_map)
        write_commit_map(merged_commit_map, get_commit_map_path())
finally:
    # Always shut the helper process down, also if filtering failed or exited.
    # Both pipes are closed before waiting so that the helper cannot block on
    # output nobody reads anymore.
    cat_file_process.stdin.close()
    cat_file_process.stdout.close()
    cat_file_process.wait()