This script merges data from a SD file into an IJC entity, based on a common key value. If that particular value is not found, a new row containing information is inserted.
/** Merge data from a SD file into an IJC entity. * * Usage: * 1. load the first SD file into IJC using the standard import(script is configured for file 'Building_blocks_GBP_fixed.sdf') * 2. add a text field named MpBp * 3. edit the variables in the 'edit these settings' section (script is configured for file '2004 Building Blocks Database.sdf') * 4. execute the script * * Note: the files used for the pre-configured settings can be found here: http://shared.chemaxon.com/projects/ijc/data/ * Note: the script will run much faster if there is an index on the column for the merge field (casno in this case). * Note: there is special handling of the values for the CASno field from the SD file because of quirks with the data * from this field. If you are using this for your own data then you will need to remove this custom processing. * * @author Tim Dudgeon ([email protected]) */ import chemaxon.formats.MolImporter import chemaxon.struc.Molecule import com.im.df.api.dml.* import com.im.commons.progress.* import com.im.df.api.support.* // ---------- edit these settings ---------------------------------------------------- String file = 'C:/data/structures/KeyOrganics/2004 Building Blocks Database.sdf' // the file to load String structureFieldName = 'Structure' // the name fo the structure field String mergePropName = 'CASno' // the name of the merge field in the file String mergeFieldName = 'casno' // the name of the merge property in the database // next two are a list of maps. key is the name in the file and value is the corresponding name of the field in the database Map insertValFieldNames = ['ID':'code', 'Purity':'purity'] // fields whose values are added only when inserting new row Map updateAndInsertValFieldNames = ['Mp/Bp':'MpBp'] // fields whose values are updated or inserted // ------------probably no need to edit anything below here --------------------------- MolImporter importer = new MolImporter(file) importer.grabbingEnabled = true Molecule mol = new Molecule() def parent = dataTree.rootVertex.entity def edp = parent.schema.dataProvider.getEntityDataProvider(parent) def structureField = parent.fields.items.find { it.name == structureFieldName } def mergeField = parent.fields.items.find { it.name == mergeFieldName } // find the fields - we need them later Map insertFields = [:] insertValFieldNames.each { k,v -> def fld = parent.fields.items.find { it.name == v } assert fld != null insertFields[k] = fld } Map updateAndInsertFields = [:] updateAndInsertValFieldNames.each { k,v -> def fld = parent.fields.items.find { it.name == v } assert fld != null updateAndInsertFields[k] = fld } def lock = edp.lockable.obtainLock('updating data') def env = EnvUtils.createDefaultEnvironmentRW(lock, 'updating data', true); int i = 1 int updates = 0 int inserts = 0 try { println "Reading file" while (importer.read(mol)) { String molStr = importer.grabbedMoleculeString String mergeVal = MPropHandler.convertToString(mol.properties(), mergePropName); // custom processing of CASno values // strangely the cas numbers are surrounded by square brackets in this file so we need to remove them if (mergeVal) { mergeVal = mergeVal.substring(1, mergeVal.length() - 1) } Map insertVals = [:] insertValFieldNames.each { k,v -> insertVals[k] = mol.getProperty(k) } Map updateAndInsertVals = [:] updateAndInsertValFieldNames.each { k,v -> updateAndInsertVals[k] = mol.getProperty(k) } boolean update = false DFTermExpression q = null if (mergeVal) { q = DFTermsFactory.createFieldOperatorValueExpr( Operators.EQUALS, mergeField, null, mergeVal) List ids = edp.queryForIds(q, SortDirective.EMPTY, env) if (ids) { update = true updates++ } else { inserts++ } } if (update) { Map vals = [:] // add the values updateAndInsertFields.each { k,v -> vals[v.id] = updateAndInsertVals[k] } DFUpdateDescription ud = DFUpdateDescription.create(parent, q, vals) Map<DFUpdateDescription, DFUpdateResult> result = edp.update([ud], DFUndoConfig.OFF, env) DFUpdateResult res = result[ud] println "$i updated ${res.changedRowsCount} rows" } else { // new map and add structure Map vals = [(structureField.id):molStr] // add the values for fields that are for insert only insertFields.each { k,v -> vals[v.id] = insertVals[k] } // and add the ones that are for both insert and update updateAndInsertFields.each { k,v -> vals[v.id] = updateAndInsertVals[k] } edp.insert(vals, null, env) println "$i inserted row" } i++ } } finally { importer.close() env?.feedback.finish() lock?.release() } println "Finished. $updates updates, $inserts inserts"
Versions: This script has been tested on IJC versions 5.6 and 5.7