ticket:197: 0001-Miscutil-Separate-extraction-of-fulltext-and-load-d.patch

File 0001-Miscutil-Separate-extraction-of-fulltext-and-load-d.patch, 6.6 KB (added by bthiell, 4 years ago)

Patch to separate loading the records and extracting the texts.

  • INSTALL

    From 9e021187b6444fa7352905fa795ef1914e8d03c8 Mon Sep 17 00:00:00 2001
    From: Benoit Thiell <bthiell@localhost.(none)>
    Date: Mon, 5 Jul 2010 09:08:37 -0400
    Subject: [PATCH] Miscutil: Separate extraction of fulltext and load-demo-records
    
    * Because the text extraction is a long process that is not needed every time
      one sets up a demo site, it has been separated from the default
      load-demo-records routine. It is now a separate inveniocfg option
      '--extract-text-from-records'.
    
    * It also makes the text extraction capabilities of Invenio more visible
      to new users, as the install guide encourages them to run a command
      named '--extract-text-from-records'.
    ---
     INSTALL                                       |    7 ++++++
     modules/miscutil/lib/inveniocfg.py            |   29 ++++++++++++++++++++++--
     modules/miscutil/lib/testutils.py             |    3 +-
     modules/webhelp/web/hacking/test-suite.webdoc |    1 +
     4 files changed, 36 insertions(+), 4 deletions(-)
    
    diff --git a/INSTALL b/INSTALL
    index f38833c..4db6d75 100644
    a b Contents 
    241241      $ sudo /etc/init.d/apache2 restart 
    242242      $ sudo -u www-data /opt/cds-invenio/bin/inveniocfg --create-demo-site 
    243243      $ sudo -u www-data /opt/cds-invenio/bin/inveniocfg --load-demo-records 
     244      $ sudo -u www-data /opt/cds-invenio/bin/inveniocfg --extract-text-from-records 
    244245      $ sudo -u www-data /opt/cds-invenio/bin/inveniocfg --run-unit-tests 
    245246      $ sudo -u www-data /opt/cds-invenio/bin/inveniocfg --run-regression-tests 
    246247      $ sudo -u www-data /opt/cds-invenio/bin/inveniocfg --run-web-tests 
    Contents 
    606607          indexing and searching of your local CDS Invenio demo 
    607608          installation. 
    608609 
     610      $ sudo -u www-data /opt/cds-invenio/bin/inveniocfg --extract-text-from-records 
     611 
     612          Optionally, populate the fulltext index by extracting text from the 
     613          fulltext files. Also runs OCR (Optical Character Recognition) for  
     614          one document (record ID 97). 
     615 
    609616      $ sudo -u www-data /opt/cds-invenio/bin/inveniocfg --run-unit-tests 
    610617 
    611618          Optionally, you can run the unit test suite to verify the 
  • modules/miscutil/lib/inveniocfg.py

    diff --git a/modules/miscutil/lib/inveniocfg.py b/modules/miscutil/lib/inveniocfg.py
    index 8f3c6d8..dcf3b63 100644
    a b def cli_cmd_load_demo_records(conf): 
    691691    """Load demo records.  Useful for testing purposes.""" 
    692692    from invenio.config import CFG_PREFIX 
    693693    from invenio.dbquery import run_sql 
     694 
     695    from invenio.bibindex_engine import get_all_indexes 
     696    all_indexes = get_all_indexes() 
     697    # Run all indexes except fulltext index for speed reasons. 
     698    all_indexes.remove('fulltext') 
     699 
    694700    print ">>> Going to load demo records..." 
    695701    run_sql("TRUNCATE schTASK") 
    696702    for cmd in ["%s/bin/bibupload -u admin -i %s/var/tmp/demobibdata.xml" % (CFG_PREFIX, CFG_PREFIX), 
    697703                "%s/bin/bibupload 1" % CFG_PREFIX, 
    698                 "%s/bin/bibdocfile --textify --with-ocr --recid 97" % CFG_PREFIX, 
    699                 "%s/bin/bibdocfile --textify --all" % CFG_PREFIX, 
    700                 "%s/bin/bibindex -u admin" % CFG_PREFIX, 
     704                "%s/bin/bibindex -w %s -u admin" % (CFG_PREFIX, ','.join(all_indexes)), 
    701705                "%s/bin/bibindex 2" % CFG_PREFIX, 
    702706                "%s/bin/bibreformat -u admin -o HB" % CFG_PREFIX, 
    703707                "%s/bin/bibreformat 3" % CFG_PREFIX, 
    def cli_cmd_load_demo_records(conf): 
    710714            sys.exit(1) 
    711715    print ">>> Demo records loaded successfully." 
    712716 
     717def cli_cmd_extract_text_from_records(conf): 
     718    """Extracts the text from the records and populates the fulltext index.""" 
     719    from invenio.config import CFG_PREFIX 
     720    from invenio.dbquery import run_sql 
     721    print ">>> Going to extract text from demo records..." 
     722    run_sql("TRUNCATE schTASK") 
     723    # Use (slow) OCR only for one document for demonstration purpose. 
     724    for cmd in ["%s/bin/bibdocfile --textify --with-ocr --recid 97" % CFG_PREFIX, 
     725                "%s/bin/bibdocfile --textify --all" % CFG_PREFIX, 
     726                "%s/bin/bibindex -w fulltext -u admin" % CFG_PREFIX, 
     727                "%s/bin/bibindex 2" % CFG_PREFIX]: 
     728        if os.system(cmd): 
     729            print "ERROR: failed execution of", cmd 
     730            sys.exit(1) 
     731    print ">>> Texts extracted successfully." 
     732 
    713733def cli_cmd_remove_demo_records(conf): 
    714734    """Remove demo records.  Useful when you are finished testing.""" 
    715735    print ">>> Going to remove demo records..." 
    def main(): 
    11651185            elif opt == '--load-demo-records': 
    11661186                cli_cmd_load_demo_records(conf) 
    11671187                done = True 
     1188            elif opt == '--extract-text-from-records': 
     1189                cli_cmd_extract_text_from_records(conf) 
     1190                done = True 
    11681191            elif opt == '--remove-demo-records': 
    11691192                cli_cmd_remove_demo_records(conf) 
    11701193                done = True 
  • modules/miscutil/lib/testutils.py

    diff --git a/modules/miscutil/lib/testutils.py b/modules/miscutil/lib/testutils.py
    index e05ffb8..d8ed2d2 100644
    a b def warn_user_about_tests(test_suite_type='regression'): 
    7676**                                                                  ** 
    7777**    $ inveniocfg --drop-demo-site \                               ** 
    7878**                 --create-demo-site \                             ** 
    79 **                 --load-demo-records                              ** 
     79**                 --load-demo-records \                            ** 
     80**                 --extract-text-from-records                      ** 
    8081**                                                                  ** 
    8182** Note that DOING THE ABOVE WILL ERASE YOUR ENTIRE DATABASE.       ** 
    8283**                                                                  ** 
  • modules/webhelp/web/hacking/test-suite.webdoc

    diff --git a/modules/webhelp/web/hacking/test-suite.webdoc b/modules/webhelp/web/hacking/test-suite.webdoc
    index 7474d2e..c4fa427 100644
    a b $ /opt/cds-invenio/bin/inveniocfg --run-regression-tests 
    450450**    $ inveniocfg --drop-demo-site \                               ** 
    451451**                 --create-demo-site \                             ** 
    452452**                 --load-demo-records \                            ** 
     453**                 --extract-text-from-records                      ** 
    453454**                                                                  ** 
    454455** Note that DOING THE ABOVE WILL ERASE YOUR ENTIRE DATABASE.       ** 
    455456**                                                                  **