#!/usr/local/bin/python

#
# Create svn changesets from a cvs repository incrementally.  `cvsps'
# might be able to replace this script; that has not been verified.
#

# $Id: cvsconv.py 134740 2011-10-10 04:33:51Z yasuoka $

import os
import rcsparse
import re
import string
import sys
import time

from hashlib import md5
from svn import core, fs, delta, repos

def usage():
    """Write the command-line synopsis to standard error."""
    sys.stderr.write('usage: cvs2svndump cvsroot [svnroot svnpath]\n')

def main():
    """Convert the CVS repository named on the command line to an SVN dump.

    Command line: ``cvsroot [svnroot svnpath]``.  The dump stream is written
    to standard output and progress messages go to standard error.

    When ``svnroot``/``svnpath`` are given, the existing repository is loaded
    first.  If a previous revision is found there, every changeset up to and
    including the one matching that revision's date and author is skipped,
    so that only newer changesets are dumped.

    Exits with status 1 on a bad argument count and status 0 when the CVS
    tree contains no changesets.
    """
    if len(sys.argv) != 2 and len(sys.argv) != 4:
        usage()
        sys.exit(1)

    cvsroot = sys.argv[1]
    if len(sys.argv) == 4:
        svnroot = sys.argv[2]
        svnpath = sys.argv[3]
    else:
        svnroot = None
        svnpath = None

    emit = sys.stdout.write     # dump stream
    warn = sys.stderr.write     # diagnostics

    do_incremental = False
    found_last_revision = False

    cvs = CvsConv(cvsroot, None)
    if svnroot is None:
        svn = SvnDumper()
    else:
        svn = SvnDumper(svnpath)
        try:
            svn.load(svnroot)
        except Exception:
            # Loading is best effort: fall back to a full (non-incremental)
            # dump, but say why instead of hiding the failure.
            warn('** svn load failed (%s); doing a full dump\n'
                 % (sys.exc_info()[1],))
        else:
            # getattr: load() may finish without ever recording a revision.
            if getattr(svn, 'last_rev', None) is not None:
                do_incremental = True
                warn('** svn loaded revision r%d by %s\n'
                     % (svn.last_rev, svn.last_author))

    warn('** walk cvs tree\n')
    cvs.walk()

    # From here on directory adds/deletes tracked by svn are emitted too.
    svn.dump = True

    changesets = sorted(cvs.changesets)
    nchangesets = len(changesets)
    warn('** cvs has %d changeset\n' % (nchangesets))

    if nchangesets <= 0:
        sys.exit(0)

    # don't use last 1 hour for safety
    dmax = changesets[-1].max_time - 3600
    header_written = False

    dec = 'iso-8859-1'
    enc = 'utf-8'

    for i, k in enumerate(changesets):
        if do_incremental and not found_last_revision:
            # Skip everything up to and including the changeset that matches
            # the newest revision already present in svn.
            if k.max_time == svn.last_date and k.author == svn.last_author:
                found_last_revision = True
            continue

        if k.max_time > dmax:
            continue

        if not header_written:
            emit('SVN-fs-dump-format-version: 2\n')
            emit('\n')
            header_written = True

        # parse the first file to get log
        finfo = k.revs[0]
        rcsfile = rcsparse.rcsfile(finfo[2])
        log = rcsfile.getlog(finfo[0]).decode(dec).encode(enc)

        revprops = str_prop('svn:author', k.author.decode(dec).encode(enc))
        revprops += str_prop('svn:date', svn_time(k.max_time))
        revprops += str_prop('svn:log', log)
        revprops += 'PROPS-END\n'

        emit('Revision-number: %d\n' % (i + 1))
        emit('Prop-content-length: %d\n' % (len(revprops)))
        emit('Content-length: %d\n' % (len(revprops)))
        emit('\n')
        emit(revprops + '\n')

        for f in k.revs:
            p = node_path(cvs.cvsroot, svnpath, f[2])

            if f[3] == 'dead':
                # A dead revision is a deletion; no file content is needed.
                if not svn.exists(p):
                    warn("Warning: remove '%s', but it does not exist.\n"
                         % (p))
                    continue
                emit('Node-path: %s\n' % (p))
                emit('Node-kind: file\n')
                emit('Node-action: delete\n')
                emit('\n')
                svn.remove(p)
                continue

            fileprops = ''
            if os.access(f[2], os.X_OK):
                fileprops += str_prop('svn:executable', '*')
            fileprops += 'PROPS-END\n'
            filecont = rcs_expand_keyword(f[2], f[0])

            md5sum = md5()
            md5sum.update(filecont)

            if not svn.exists(p):
                # svn.add() may emit directory nodes; it must run before the
                # file node itself is written.
                svn.add(p)
                action = 'add'
            else:
                action = 'change'
            emit('Node-path: %s\n' % (p))
            emit('Node-kind: file\n')
            emit('Node-action: %s\n' % (action))

            emit('Prop-content-length: %d\n' % (len(fileprops)))
            emit('Text-content-length: %d\n' % (len(filecont)))
            emit('Text-content-md5: %s\n' % (md5sum.hexdigest()))
            emit('Content-length: %d\n' % (len(fileprops) + len(filecont)))
            emit('\n')
            emit(fileprops + filecont + '\n')
            emit('\n')

    if do_incremental and not found_last_revision:
        warn('** warning: svn revision r%d was not found in cvs; '
             'nothing was dumped\n' % (svn.last_rev))
    warn('** dumped\n')


# Maximum distance in seconds between two revisions' timestamps for them
# to be treated as part of the same changeset.
MAX_COMMIT_SEC = 90


class ChangeSetKey:
    """Fuzzy key identifying one changeset (branch, author, log, time range).

    Two keys compare equal when their branch, author and log hash match and
    neither of the time checks in ``__cmp__`` finds them more than
    MAX_COMMIT_SEC apart.  ``__hash__`` ignores the times, so a key can be
    looked up in a dict with a slightly different timestamp and then merged
    into the existing entry with ``merge``.

    Attributes:
        branch, author: values given to the constructor.
        min_time, max_time: time range covered by the changeset.
        revs: list filled in by the caller with the revisions it contains.
        log_hash: polynomial (base 31) hash of the log message characters.
    """

    def __init__(self, branch, author, time, log):
        self.branch = branch
        self.author = author
        self.min_time = time
        self.max_time = time
        self.revs = []
        h = 0
        for c in log:
            h = 31 * h + ord(c)
        self.log_hash = h

    def _identity(self):
        # The time-independent part of the key.
        return (self.log_hash, self.branch, self.author)

    def __cmp__(self, anon):
        """Return a negative, zero or positive number, cmp()-style.

        Anything that is not a ChangeSetKey compares as greater (-1).
        """
        if not isinstance(anon, ChangeSetKey):
            return -1
        ma = anon.max_time - self.max_time
        mi = self.min_time - anon.min_time
        if ma > MAX_COMMIT_SEC:
            return -ma
        if mi > MAX_COMMIT_SEC:
            return mi
        mine = self._identity()
        theirs = anon._identity()
        if mine == theirs:
            return 0
        order = mi if mi != 0 else -ma
        if order != 0:
            return order
        # Different changesets with identical timestamps must never compare
        # equal (that would contradict __hash__), so break the tie on the
        # time-independent part.
        if mine < theirs:
            return -1
        return 1

    # Rich comparisons mirror __cmp__, so ordering and dict lookups behave
    # the same on interpreters that do not consult __cmp__.
    def __eq__(self, anon):
        return self.__cmp__(anon) == 0

    def __ne__(self, anon):
        return self.__cmp__(anon) != 0

    def __lt__(self, anon):
        return self.__cmp__(anon) < 0

    def __le__(self, anon):
        return self.__cmp__(anon) <= 0

    def __gt__(self, anon):
        return self.__cmp__(anon) > 0

    def __ge__(self, anon):
        return self.__cmp__(anon) >= 0

    def merge(self, anon):
        """Widen this key's time range so that it also covers ``anon``."""
        self.max_time = max(self.max_time, anon.max_time)
        self.min_time = min(self.min_time, anon.min_time)

    def __hash__(self):
        return hash(self.branch + '/' + self.author) * 31 + self.log_hash

class CvsConv:
    """Collect changesets from the RCS files found below a CVS root.

    Attributes:
        cvsroot: path of the CVS repository root.
        module: optional sub-directory of ``cvsroot`` to restrict the walk to.
        changesets: dict mapping each ChangeSetKey to itself; every key's
            ``revs`` list holds ``[revision, previous revision, rcs path,
            state]`` entries.
    """

    def __init__(self, cvsroot, module = None):
        self.cvsroot = cvsroot
        self.module = module
        self.changesets = dict()

    def walk(self):
        """Parse every ``,v`` file found below the cvsroot (and module)."""
        parts = [self.cvsroot]
        if self.module is not None:
            parts.append(self.module)
        path = os.path.join(*parts)

        for root, dirs, files in os.walk(path):
            for f in files:
                if not f.endswith(',v'):
                    continue
                self.parse_file(root + os.sep + f)

    def parse_file(self, path):
        """Add the trunk and 1.1.1 vendor revisions of ``path`` to changesets.

        Revisions on any other branch are ignored.
        """
        rcsfile = rcsparse.rcsfile(path)
        branches = {'1': 'HEAD', '1.1.1': 'VENDOR'}
        for k, v in rcsfile.symbols.items():
            r = v.split('.')
            if len(r) == 3:
                branches[v] = 'VENDOR'
            elif len(r) >= 3 and r[-2] == '0':
                # Symbol of the form x.y.0.z: record it under x.y.z.
                branches['.'.join(r[:-2] + r[-1:])] = k

        # sort by time and revision: ascending time, and for equal times
        # descending revision string (two stable passes).
        revs = sorted(rcsfile.revs.items(), key=lambda kv: kv[1][0],
                      reverse=True)
        revs.sort(key=lambda kv: kv[1][1])

        p = '0'
        novendor = False
        have_initial_revision = False
        for k, v in revs:
            r = k.split('.')
            if len(r) == 4 and r[:3] == ['1', '1', '1']:
                if r[3] == '1':
                    # 1.1.1.1: candidate for the initial revision.
                    if have_initial_revision:
                        continue
                    if v[3] == 'dead':
                        continue
                    have_initial_revision = True
                elif novendor:
                    # Vendor revisions are dropped once the trunk has moved
                    # past 1.1.
                    continue
            elif len(r) == 2:
                if r[0] == '1' and r[1] == '1':
                    if have_initial_revision:
                        continue
                    if v[3] == 'dead':
                        continue
                    have_initial_revision = True
                elif r[0] == '1':
                    novendor = True
            else:
                # trunk only
                continue

            b = '.'.join(r[:-1])
            # Only trunk and 1.1.1 revisions get here.  A trunk revision
            # whose major number is not 1 (e.g. 2.1) has no entry in
            # `branches`; it is still on the trunk, so file it under HEAD
            # instead of failing with KeyError.
            a = ChangeSetKey(branches.get(b, 'HEAD'), v[2], v[1],
                             rcsfile.getlog(v[0]))
            # Merge into a matching changeset if one already exists.
            c = self.changesets.pop(a, None)
            if c is not None:
                c.merge(a)
                a = c
            self.changesets[a] = a
            a.revs.append([k, p, path, v[3]])
            p = k


def svn_time(t):
    """Format the epoch timestamp ``t`` as a UTC svn:date property value."""
    utc = time.gmtime(t)
    # The timestamp has second resolution, so the fraction is always zero.
    return time.strftime("%Y-%m-%dT%H:%M:%S", utc) + ".000000Z"

def str_prop(k,v):
    """Return one property record: 'K <len>', key, 'V <len>', value."""
    key_part = 'K %d\n%s\n' % (len(k), k)
    value_part = 'V %d\n%s\n' % (len(v), v)
    return key_part + value_part

def node_path(r,n,p):
    """Map an RCS file path to the node path used in the dump.

    r -- CVS root; stripped from the front of the path when it is a prefix
    n -- path prefix inside the svn repository; may be None or empty
    p -- path of the RCS file; its last two characters (',v') are dropped

    An 'Attic' directory directly above the file is removed from the path.
    """
    if r.endswith('/'):
        r = r[:-1]
    path = p[:-2]
    parts = path.split('/')
    # A path without any directory component has a single part; only look
    # at the parent component when there is one.
    if len(parts) > 1 and parts[-2] == 'Attic':
        path = '/'.join(parts[:-2] + parts[-1:])
    if path.startswith(r):
        path = path[len(r) + 1:]
    if n is None or len(n) == 0:
        return path
    return '%s/%s' % (n, path)

class SvnDumper:
    """Track which files and directories exist below an svn path.

    While ``dump`` is true, creating or removing a directory also writes the
    corresponding dump node to standard output.

    Attributes:
        root: svn path prefix, without a trailing '/'.
        dirs: dict mapping a directory path to a dict of its file names.
        dump: whether directory adds/deletes are written to standard output.
        last_rev, last_author, last_date: newest revision found by load()
            whose author is not 'svnadmin'; None until load() finds one.
    """

    def __init__(self, root=''):
        self.root = root
        if self.root != '' and self.root[-1] == '/':
            self.root = self.root[:-1]
        self.dirs = {}
        # The placeholder entry keeps rmdir() from ever removing the root.
        self.dirs[self.root] = {'dontdelete': 1}
        self.dump = False
        self.last_rev = None
        self.last_author = None
        self.last_date = None

    def exists(self, path):
        """Return True if the file ``path`` has been added."""
        parent = os.path.dirname(path)
        if parent not in self.dirs:
            return False
        return os.path.basename(path) in self.dirs[parent]

    def add(self, path):
        """Record the file ``path``, creating parent directories as needed."""
        parent = os.path.dirname(path)
        if parent not in self.dirs:
            self.mkdir(parent)
        self.dirs[parent][os.path.basename(path)] = 1

    def remove(self, path):
        """Forget the file ``path`` and prune directories left empty.

        Raises KeyError if the file was never added.
        """
        parent = os.path.dirname(path)
        if parent == path:
            return
        del self.dirs[parent][os.path.basename(path)]
        self.rmdir(parent)

    def rmdir(self, path):
        """Remove ``path`` and its ancestors if they hold no files or dirs."""
        if len(self.dirs[path]) > 0:
            return
        for other in self.dirs:
            if other != path and other.startswith(path + '/'):
                # A sub-directory still exists.
                return
        if self.dump:
            sys.stdout.write('Node-path: %s\n' % (path))
            sys.stdout.write('Node-kind: dir\n')
            sys.stdout.write('Node-action: delete\n')
            sys.stdout.write('\n')
        del self.dirs[path]
        parent = os.path.dirname(path)
        if parent == path or parent not in self.dirs:
            return
        self.rmdir(parent)

    def mkdir(self, path):
        """Record the directory ``path``, creating missing ancestors first."""
        if path in self.dirs:
            return
        parent = os.path.dirname(path)
        if parent == path:
            # dirname() made no progress: nothing above this to create.
            return
        self.mkdir(parent)
        if self.dump:
            sys.stdout.write('Node-path: %s\n' % (path))
            sys.stdout.write('Node-kind: dir\n')
            sys.stdout.write('Node-action: add\n')
            sys.stdout.write('\n')
            sys.stdout.write('\n')
        self.dirs[path] = {}

    def load(self, repo_path):
        """Load the state of ``self.root`` from the repository ``repo_path``.

        Walks the history of ``self.root`` back from the youngest revision
        to the first revision whose author is not 'svnadmin', records it in
        last_rev / last_author / last_date, and fills ``dirs`` with the
        files and directories reported by a tree delta against revision 0.
        If no such revision exists the last_* attributes stay None.
        """
        repo_path = core.svn_path_canonicalize(repo_path)
        repos_ptr = repos.open(repo_path)
        fs_ptr = repos.fs(repos_ptr)
        rev = fs.youngest_rev(fs_ptr)
        base_root = fs.revision_root(fs_ptr, 0)
        root = fs.revision_root(fs_ptr, rev)

        def authz_cb(root, path, pool):
            # Allow access to every path.
            return 1

        hist = fs.node_history(root, self.root)
        while hist is not None:
            hist = fs.history_prev(hist, 0)
            if hist is None:
                # History exhausted without a usable revision.
                # NOTE(review): assumes history_prev yields None at the end
                # of history -- confirm against the svn bindings.
                break
            dummy, rev = fs.history_location(hist)
            d = fs.revision_prop(fs_ptr, rev, core.SVN_PROP_REVISION_DATE)
            author = fs.revision_prop(fs_ptr, rev,
                                      core.SVN_PROP_REVISION_AUTHOR)
            if author == 'svnadmin':
                continue
            self.last_author = author
            # Divided by 10**6 so it can be compared with the cvs timestamps
            # (presumably microseconds to seconds -- TODO confirm).
            self.last_date = core.svn_time_from_cstring(d) // 1000000
            self.last_rev = rev
            editor = SvnDumperEditor(self)
            e_ptr, e_baton = delta.make_editor(editor)
            repos.dir_delta(base_root, '', '', root, self.root, e_ptr,
                            e_baton, authz_cb, 0, 1, 0, 0)
            break

class SvnDumperEditor(delta.Editor):
    """Delta editor that records added files and directories in a dumper."""

    def __init__(self, dumper):
        self.dumper = dumper

    def _rooted(self, path):
        # Prefix the reported path with the dumper's root.
        return self.dumper.root + '/' + path

    def add_file(self, path, *args):
        self.dumper.add(self._rooted(path))

    def add_directory(self, path, *args):
        self.dumper.mkdir(self._rooted(path))


# ----------------------------------------------------------------------
# RCS Keywords
# ----------------------------------------------------------------------
# Matches, from the start of a line, the shortest prefix that ends in
# "$<keyword>" followed by either "$" or ":"; group 1 is the keyword name.
re_kw = re.compile(r".*?\$(Author|Date|Header|Id|OpenBSD|Log|Name|RCSfile|Revision|Source|State|Mdocdate)[\$:]")

# One bit per piece of information a keyword can expand to.
RCS_KW_AUTHOR   = 0x0001
RCS_KW_DATE     = 0x0002
RCS_KW_LOG      = 0x0004
RCS_KW_NAME     = 0x0008
RCS_KW_RCSFILE  = 0x0010
RCS_KW_REVISION = 0x0020
RCS_KW_SOURCE   = 0x0040
RCS_KW_STATE    = 0x0080
RCS_KW_FULLPATH = 0x0100
RCS_KW_MDOCDATE = 0x0200

# Composite keywords.
RCS_KW_ID = (
    RCS_KW_RCSFILE
    | RCS_KW_REVISION
    | RCS_KW_DATE
    | RCS_KW_AUTHOR
    | RCS_KW_STATE
)
RCS_KW_HEADER = RCS_KW_ID | RCS_KW_FULLPATH

# Keyword name -> bit mask of what it expands to.
rcs_expkw = {
    "Author": RCS_KW_AUTHOR,
    "Date": RCS_KW_DATE,
    "Header": RCS_KW_HEADER,
    "Id": RCS_KW_ID,
    "OpenBSD": RCS_KW_ID,
    "Log": RCS_KW_LOG,
    "Name": RCS_KW_NAME,
    "RCSfile": RCS_KW_RCSFILE,
    "Revision": RCS_KW_REVISION,
    "Source": RCS_KW_SOURCE,
    "State": RCS_KW_STATE,
    "Mdocdate": RCS_KW_MDOCDATE,
}

# Keyword expansion mode bits.
RCS_KWEXP_NONE = 0x01
RCS_KWEXP_NAME = 0x02    # include keyword name
RCS_KWEXP_VAL  = 0x04    # include keyword value
RCS_KWEXP_LKR  = 0x08    # include name of locker
RCS_KWEXP_OLD  = 0x10    # generate old keyword string
RCS_KWEXP_ERR  = 0x20    # mode has an error
RCS_KWEXP_DEFAULT = RCS_KWEXP_NAME | RCS_KWEXP_VAL
RCS_KWEXP_KVL     = RCS_KWEXP_NAME | RCS_KWEXP_VAL | RCS_KWEXP_LKR

def rcs_kflag_get(flags):
    """Translate a keyword expansion flag string into RCS_KWEXP_* bits.

    None selects RCS_KWEXP_DEFAULT.  'k', 'v' and 'l' may be combined.
    'o' and 'b' must be the only character, otherwise RCS_KWEXP_ERR is set
    as well.  Any other character sets RCS_KWEXP_ERR.
    """
    if flags is None:
        return RCS_KWEXP_DEFAULT

    combinable = {
        'k': RCS_KWEXP_NAME,
        'v': RCS_KWEXP_VAL,
        'l': RCS_KWEXP_LKR,
    }
    standalone = {
        'o': RCS_KWEXP_OLD,
        'b': RCS_KWEXP_NONE,
    }
    is_single = len(flags) == 1

    result = 0
    for ch in flags:
        if ch in combinable:
            result |= combinable[ch]
        elif ch in standalone:
            result |= standalone[ch]
            if not is_single:
                result |= RCS_KWEXP_ERR
        else:
            result |= RCS_KWEXP_ERR
    return result

def split_lines(buf):
    """Split ``buf`` into lines, keeping each terminating newline.

    Only '\\n' is treated as a line terminator.  A final piece without a
    newline is returned as the last element; an empty buffer gives [].
    """
    lines = []
    start = 0
    while True:
        nl = buf.find('\n', start)
        if nl < 0:
            break
        lines.append(buf[start:nl + 1])
        start = nl + 1

    if start < len(buf):
        lines.append(buf[start:])

    return lines

def rcs_expand_keyword(filename, r):
    """Return the text of revision ``r`` of ``filename``, keywords expanded.

    filename -- path of the RCS file
    r        -- revision key into the parsed file's ``revs`` mapping

    The file's own expansion mode decides how keywords are rewritten.  When
    that mode contains RCS_KWEXP_NONE or RCS_KWEXP_OLD the checked-out text
    is returned unchanged.  Each expanded keyword is cut to 255 characters.
    For $Log$ the log entry is inserted after the line holding the keyword,
    each line prefixed with the text that preceded the keyword.
    """
    rcs = rcsparse.rcsfile(filename)
    rev = rcs.revs[r]

    mode = rcs_kflag_get(rcs.expand)
    if (mode & (RCS_KWEXP_NONE | RCS_KWEXP_OLD)) != 0:
        return rcs.checkout(rev[0])

    basename = os.path.basename(filename)
    out = []
    for line in split_lines(rcs.checkout(rev[0])):
        logbuf = ''
        while True:
            m = re_kw.match(line)
            if m is None:
                break
            # Locate the '$' that closes the keyword; without one on this
            # line the rest of the line is copied through untouched.
            if len(line) > m.end(1) and line[m.end(1)] == '$':
                dsign = m.end(1)
            else:
                dsign = line.find('$', m.end(1))
                if dsign < 0:
                    break
            prefix = line[:m.start(1) - 1]
            out.append(prefix)
            expbuf = ''
            if (mode & RCS_KWEXP_NAME) != 0:
                expbuf += '$'
                expbuf += m.group(1)
                if (mode & RCS_KWEXP_VAL) != 0:
                    expbuf += ': '
            if (mode & RCS_KWEXP_VAL) != 0:
                expkw = rcs_expkw[m.group(1)]
                if (expkw & RCS_KW_RCSFILE) != 0:
                    if (expkw & RCS_KW_FULLPATH) != 0:
                        expbuf += filename
                    else:
                        expbuf += basename
                    expbuf += " "
                if (expkw & RCS_KW_REVISION) != 0:
                    expbuf += rev[0]
                    expbuf += " "
                if (expkw & RCS_KW_DATE) != 0:
                    expbuf += time.strftime("%Y/%m/%d %H:%M:%S ",
                                            time.gmtime(rev[1]))
                if (expkw & RCS_KW_MDOCDATE) != 0:
                    d = time.gmtime(rev[1])
                    # %e is used without a separating space for days < 10.
                    expbuf += time.strftime(
                        "%B%e %Y " if (d.tm_mday < 10) else "%B %e %Y ", d)
                if (expkw & RCS_KW_AUTHOR) != 0:
                    expbuf += rev[2]
                    expbuf += " "
                if (expkw & RCS_KW_STATE) != 0:
                    expbuf += rev[3]
                    expbuf += " "
                if (expkw & RCS_KW_LOG) != 0:
                    # Comment leader: the prefix minus trailing blanks/tabs.
                    p = prefix.rstrip(' \t')
                    if (expkw & RCS_KW_FULLPATH) != 0:
                        expbuf += filename
                    else:
                        expbuf += basename
                    expbuf += " "
                    logbuf += '%s Revision %s ' % (p, rev[0])
                    logbuf += time.strftime("%Y/%m/%d %H:%M:%S ",
                                            time.gmtime(rev[1]))
                    logbuf += rev[2] + '\n'
                    for lline in split_lines(rcs.getlog(rev[0])):
                        logbuf += p + ' ' + lline
                    logbuf += p + '\n'
                if (expkw & RCS_KW_SOURCE) != 0:
                    expbuf += filename
                    expbuf += " "
                if (expkw & RCS_KW_NAME) != 0:
                    expbuf += " "
            if (mode & RCS_KWEXP_NAME) != 0:
                expbuf += '$'
            out.append(expbuf[:255])
            # Keep scanning the remainder of the line for more keywords.
            line = line[dsign + 1:]
        out.append(line)
        if len(logbuf) > 0:
            out.append(logbuf)
    return ''.join(out)

# ----------------------------------------------------------------------
# entry point
# ----------------------------------------------------------------------
if __name__ == '__main__':
    # Run the converter only when executed as a script.
    main()
