resuming feature

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@20 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
13 years ago · 1519058872
parent cb12252aaf
commit 1519058872
1 changed files with 83 additions and 54 deletions
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@ -57,7 +57,7 @@ def cleanHTML(raw=''):
        sys.exit()
    return raw

-def getTitles(config={}):
+def getTitles(config={}, start='!'):
    #Get page titles parsing Special:Allpages or using API (fix)
    #
    #http://en.wikipedia.org/wiki/Special:AllPages
@ -150,7 +150,7 @@ def getXMLPage(config={}, title=''):
        params['offset'] = '1'
        params['limit'] = limit
    data = urllib.urlencode(params)
-    req = urllib2.Request(url=domain, data=data, headers=headers)
+    req = urllib2.Request(url=config['domain'], data=data, headers=headers)
    f = urllib2.urlopen(req)
    xml = f.read()

@ -161,7 +161,7 @@ def getXMLPage(config={}, title=''):
        while not truncated and params['offset']:
            params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the acum XML
            data = urllib.urlencode(params)
-            req2 = urllib2.Request(url=domain, data=data, headers=headers)
+            req2 = urllib2.Request(url=config['domain'], data=data, headers=headers)
            f2 = urllib2.urlopen(req2)
            xml2 = f2.read()
            if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk?
@ -183,19 +183,41 @@ def cleanXML(xml=''):
    xml = xml.split('</mediawiki>')[0]
    return xml

-def generateXMLDump(config={}, titles=[]):
-    print 'Retrieving the XML for every page'
-    header = getXMLHeader(domain=config['domain'])
+def generateXMLDump(config={}, titles=[], start=''):
+    print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
+    header = getXMLHeader(config=config)
    footer = '</mediawiki>\n' #new line at the end
-    xmlfilename = '%s/%s-%s-%s.xml' % (config['path'], domain2prefix(domain=domain), config['curonly'] and 'current' or 'history', datetime.datetime.now().strftime('%Y%m%d'))
-    xmlfile = open('%s/%s' (config['path'], xmlfilename), 'w')
-    xmlfile.write(header)
+    xmlfilename = '%s-%s-%s.xml' % (domain2prefix(domain=config['domain']), config['date'], config['curonly'] and 'current' or 'history')
+    xmlfile = ''
+    lock = True
+    if start:
+        #remove the last chunk of xml dump (it is probably incomplete)
+        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r')
+        xml = xmlfile.read()
+        xmlfile.close()
+        xml = xml.split('<title>%s</title>' % (start))[0]
+        xml = '\n'.join(xml.split('\n')[:-2]) # [:-2] drops the last two pieces: the dangling <page> line and the partial line that preceded <title>
+        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
+        xmlfile.write('%s\n' % (xml))
+        xmlfile.close()
+    else:
+        #requested complete xml dump
+        lock = False
+        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
+        xmlfile.write(header)
+        xmlfile.close()
+    
+    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
    c = 1
    for title in titles:
+        if title == start: #start downloading from start, included
+            lock = False
+        if lock:
+            continue
        delay(config=config)
        if c % 10 == 0:
            print '    Downloaded %d pages' % (c)
-        xml = getXMLPage(config={}, title=title)
+        xml = getXMLPage(config=config, title=title)
        xml = cleanXML(xml=xml)
        xmlfile.write(xml)
        c += 1
@ -205,8 +227,8 @@ def generateXMLDump(config={}, titles=[]):

 def saveTitles(config={}, titles=[]):
    #save titles in a txt for resume if needed
-    titlesfilename = '%s/%s-titles-%s.txt' % (config['path'], domain2prefix(domain=domain), datetime.datetime.now().strftime('%Y%m%d'))
-    titlesfile = open(titlesfilename, 'w')
+    titlesfilename = '%s-%s-titles.txt' % (domain2prefix(domain=config['domain']), config['date'])
+    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w')
    titles.append('--END--')
    titlesfile.write('\n'.join(titles))
    titlesfile.close()
@ -324,6 +346,7 @@ def getParameters():
    domain = 'http://osl2.uca.es/wikira/index.php'
    config = {
        'curonly': False,
+        'date': datetime.datetime.now().strftime('%Y%m%d'),
        'domain': domain,
        'images': False,
        'logs': False,
@ -394,8 +417,8 @@ Write --help for help."""
        sys.exit()
        #usage()
    
-    #calculating path
-    config['path'] = './%s-dump-%s' % (domain2prefix(domain=config['domain']), datetime.datetime.now().strftime('%Y%m%d'))
+    #calculating path, if not defined by user with --path=
+    config['path'] = './%s-%s-wikidump' % (domain2prefix(domain=config['domain']), config['date'])
    
    return config, other

@ -414,28 +437,26 @@ def main():
    #creating path or resuming if desired
    c = 2
    originalpath = config['path'] # to avoid concat blabla-2, blabla-2-3, and so on...
-    while os.path.isdir(config['path']):
+    while not other['resume'] and os.path.isdir(config['path']): #skip this loop if resume was requested from the beginning
        print '\nWarning!: "%s" path exists' % (config['path'])
        reply = raw_input('There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], otherwise no)? ' % (config['path'], config['path'], configfilename))
        if reply.lower() in ['yes', 'y']:
-            if os.path.isfile('%s/%s' % (config['path'], configfilename)):
-                print 'Loading config file...'
-                config = loadConfig(config=config, configfilename=configfilename)
-            else:
-                print 'No config file found. Aborting.'
+            if not os.path.isfile('%s/%s' % (config['path'], configfilename)):
+                print 'No config file found. I can\'t resume. Aborting.'
                sys.exit()
            print 'You have selected YES'
-            print 'OK, resuming...'
            other['resume'] = True
            break
        else:
            print 'You have selected NO'
-            print 'Trying generating a new dump into a new directory...'
        config['path'] = '%s-%d' % (originalpath, c)
        print 'Trying "%s"...' % (config['path'])
        c += 1

-    if not other['resume']:
+    if other['resume']:
+        print 'Loading config file...'
+        config = loadConfig(config=config, configfilename=configfilename)
+    else:
        os.mkdir(config['path'])
        saveConfig(config=config, configfilename=configfilename)
    
@ -443,42 +464,50 @@ def main():
    #fix, make it possible to resume the titles list from where it left off (for large wikis)
    titles = []
    if other['resume']:
-        #load titles
-        #search last
-        last = 'lastline'
-        if last == '--END--':
-            #titles list is complete
-            pass
-            lastlinexml = 'aaa'
-            if lastlinexml == '</mediawiki>\n':
+        print 'Resuming previous dump process...'
+        if config['xml']:
+            #load titles
+            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(domain=config['domain']), config['date']), 'r')
+            raw = f.read()
+            titles = raw.split('\n')
+            lasttitle = titles[-1]
+            f.close()
+            if lasttitle == '--END--':
+                #titles list is complete
+                print 'Titles list was completed in the previous session'
+            else:
+                #start = last
+                #remove complete namespaces and then getTitles(config=config, start=start)
+                #titles += getTitles(config=config, start=last)
+                print 'Titles list is incomplete. Resuming...'
+                #search last
+                last = 'lastline'
+                titles += getTitles(config=config, start='!') #fix, try resume not reload entirely, change start='!' and develop the feature into getTitles()
+                saveTitles(config=config, titles=titles)
+            #checking xml dump
+            f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(domain=config['domain']), config['date'], config['curonly'] and 'current' or 'history'), 'r')
+            xml = f.read()
+            f.close()
+            if re.findall('</mediawiki>', xml):
                #xml dump is complete
-                pass
-                lastimage = 'pepito'
-                if lastimage == 'aaaa':
-                    #image dump complete
-                    pass
-                    lastlog = 'aaaa'
-                    if lastlog == 'loquesea':
-                        #log dump complete
-                        pass
-                    else:
-                        #resume log
-                        pass
-                else:
-                    #resume images
-                    pass
+                print 'XML dump was completed in the previous session'
            else:
-                #resume xml dump
-                pass
-        else:
-            #start = last
-            #remove complete namespaces and then getTitles(config=config, start=start)
-            #titles += getTitles(config=config, start=last)
+                xmltitles = re.findall(r'<title>([^<]+)</title>', xml)
+                lastxmltitle = ''
+                if xmltitles:
+                    lastxmltitle = xmltitles[-1]
+                generateXMLDump(config=config, titles=titles, start=lastxmltitle)
+        
+        if config['images']:
+            pass
+        
+        if config['logs']:
            pass
    else:
-        #titles += getTitles(config=config, start='!')
-        #saveTitles(config=config, titles=titles)
+        print 'Trying generating a new dump into a new directory...'
        if config['xml']:
+            titles += getTitles(config=config, start='!')
+            saveTitles(config=config, titles=titles)
            generateXMLDump(config=config, titles=titles)
        if config['images']:
            generateImageDump(config=config)