[minion~hg:88] It's ok if we get a negative position while searching.

  • From: stgreen@kenai.com
  • To: commits@minion.kenai.com
  • Subject: [minion~hg:88] It's ok if we get a negative position while searching.
  • Date: Thu, 28 Apr 2011 19:30:58 +0000

Project:    minion
Repository: hg
Revision:   88
Author:     stgreen
Date:       2011-04-04 00:47:18 UTC
Link:       

Log Message:
------------
Getting bigram dictionaries working for wildcard matches against saved fields.
Be more sane about what gets bigrammified.
It's ok if we get a negative position while searching.


Revisions:
----------
86
87
88


Modified Paths:
---------------
Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java
Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java
Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java
Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryBiGramDictionary.java
Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryDictionary.java
Minion/src/com/sun/labs/minion/test/QueryTest.java
Minion/src/com/sun/labs/minion/indexer/DiskField.java
Minion/src/com/sun/labs/minion/indexer/partition/InvFileDiskPartition.java


Diffs:
------
diff -r 593b8eda9d7d -r 61833746ad29 
Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java
--- a/Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java  Fri 
Mar 25 17:16:31 2011 -0400
+++ b/Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java  Sun 
Apr 03 16:36:49 2011 -0700
@@ -144,25 +144,35 @@
 
         if(header.tokenBGOffset > 0) {
             dictFile.seek(header.tokenBGOffset);
+            DiskDictionary tokenDict = 
+                    dicts[Type.UNCASED_TOKENS.ordinal()] != null ?
+                    dicts[Type.UNCASED_TOKENS.ordinal()] :
+                    dicts[Type.CASED_TOKENS.ordinal()];
             tokenBigrams = new DiskBiGramDictionary(dictFile, postIn[0],
                                                     
DiskDictionary.PostingsInputType.FILE_PART_POST,
                                                     
DiskDictionary.BufferType.FILEBUFFER,
                                                     256, 2048, 2048, 2048, 
2048,
                                                     null,
-                                                    
dicts[Type.UNCASED_TOKENS.
-                    ordinal()]);
+                                                    tokenDict);
             tokenBigrams.setPartition(field.partition);
         }
 
         if(header.savedBGOffset > 0) {
             dictFile.seek(header.savedBGOffset);
+            DiskDictionary savedValueDict = 
+                    dicts[Type.UNCASED_SAVED.ordinal()] != null ?
+                    dicts[Type.UNCASED_SAVED.ordinal()] :
+                    dicts[Type.RAW_SAVED.ordinal()];
+                    
             savedBigrams = new DiskBiGramDictionary(dictFile, postIn[0],
                                                     
DiskDictionary.PostingsInputType.FILE_PART_POST,
                                                     
DiskDictionary.BufferType.FILEBUFFER,
                                                     256, 2048, 2048, 2048, 
2048,
                                                     null,
-                                                    dicts[Type.UNCASED_SAVED.
-                    ordinal()]);
+                                                    savedValueDict);
+            logger.info(String.format("field: %s", info.getName()));
+            logger.info(String.format("savedBG header: %s", 
savedBigrams.getHeader()));
+            logger.info(String.format("savedUN header: %s", 
savedValueDict.getHeader()));
             savedBigrams.setPartition(field.partition);
         }
 
@@ -415,23 +425,23 @@
             return null;
         }
         QueryEntry[] qes;
-        if(caseSensitive) {
+        if (caseSensitive) {
             qes = dicts[Type.RAW_SAVED.ordinal()].getMatching(savedBigrams, 
val,
-                                                              true,
-                                                              maxEntries,
-                                                              timeLimit);
+                    true,
+                    maxEntries,
+                    timeLimit);
         } else {
-            if(dicts[Type.UNCASED_SAVED.ordinal()] == null) {
+            if (dicts[Type.UNCASED_SAVED.ordinal()] == null) {
                 logger.warning(String.format(
                         "Can't get uncased matches for string field %s",
                         info.getName()));
                 return null;
             }
-            qes = 
dicts[Type.UNCASED_SAVED.ordinal()].getMatching(savedBigrams, val.
-                    toLowerCase(),
-                                                                  false,
-                                                                  maxEntries,
-                                                                  timeLimit);
+            qes = 
dicts[Type.UNCASED_SAVED.ordinal()].getMatching(savedBigrams,
+                    val.toLowerCase(),
+                    false,
+                    maxEntries,
+                    timeLimit);
         }
         return new ArrayDictionaryIterator(qes);
     }

diff -r 593b8eda9d7d -r 61833746ad29 
Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java
--- a/Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java      
  Fri Mar 25 17:16:31 2011 -0400
+++ b/Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java      
  Sun Apr 03 16:36:49 2011 -0700
@@ -3,6 +3,7 @@
 
 import com.sun.labs.minion.FieldInfo;
 import com.sun.labs.minion.indexer.dictionary.DateNameHandler;
+import com.sun.labs.minion.indexer.dictionary.DictionaryWriter;
 import com.sun.labs.minion.indexer.dictionary.DoubleNameHandler;
 import com.sun.labs.minion.indexer.dictionary.LongNameHandler;
 import com.sun.labs.minion.indexer.dictionary.MemoryBiGramDictionary;
@@ -30,6 +31,7 @@
 import java.util.Arrays;
 import java.util.Date;
 import java.util.List;
+import java.util.logging.Level;
 import java.util.logging.Logger;
 
 /**
@@ -116,7 +118,9 @@
     /**
      * An array of the sets of entries saved per document at indexing time.
      */
-    private List[] dv = new List[128];
+    private List[] dv;
+    
+    private List[] ucdv;
 
     private EntryFactory vectorEntryFactory = new 
EntryFactory(Postings.Type.ID_FREQ);
 
@@ -152,10 +156,12 @@
         }
 
         if(field.saved) {
+            dv = new List[128];
             dicts[Type.RAW_SAVED.ordinal()] = new 
MemoryDictionary<N>(savedEntryFactory);
             if(field.uncased) {
                 dicts[Type.UNCASED_SAVED.ordinal()] = new 
MemoryDictionary<N>(
                         savedEntryFactory);
+                ucdv = new List[128];
             }
         }
 
@@ -260,40 +266,38 @@
             return;
         }
 
-        IndexEntry savedEntry = null;
-
-        if(info.getType() == FieldInfo.Type.STRING) {
-            if((!field.uncased && !field.cased) || field.cased) {
-                savedEntry = dicts[Type.RAW_SAVED.ordinal()].put(name);
+        //
+        // We'll just store the saved entries per-document, since we might 
be adding
+        // entries in non-document ID order (e.g., when doing 
classification.)
+        // We'll build the actual postings lists at dump time.
+        if (dicts[Type.RAW_SAVED.ordinal()] != null) {
+            IndexEntry rawSavedEntry = 
dicts[Type.RAW_SAVED.ordinal()].put(name);
+            if (docID >= dv.length) {
+                dv = Arrays.copyOf(dv, (docID + 1) * 2);
             }
 
-            if(field.uncased) {
-                IndexEntry uce = 
dicts[Type.UNCASED_SAVED.ordinal()].put(CharUtils.
-                        toLowerCase(
-                        name.toString()));
-
-                //
-                // If there was no cased version saved, we'll keep the 
uncased version.
-                if(savedEntry == null) {
-                    savedEntry = uce;
-                }
+            if (dv[docID] == null) {
+                dv[docID] = new ArrayList<IndexEntry>();
             }
-
-        } else {
-            savedEntry = dicts[Type.RAW_SAVED.ordinal()].put(name);
+            dv[docID].add(rawSavedEntry);
         }
 
         //
-        // We'll just store the entries per-document, since we might be 
adding
-        // entries in non-document ID order.  We'll build the actual postings
-        // lists at dump time.
-        if(docID >= dv.length) {
-            dv = Arrays.copyOf(dv, (docID+1) * 2);
+        // Handle the uncased values for string fields.
+        if (dicts[Type.UNCASED_SAVED.ordinal()] != null) {
+            IndexEntry uncasedSavedEntry = 
dicts[Type.UNCASED_SAVED.ordinal()].
+                    put(CharUtils.toLowerCase(
+                    name.toString()));
+            if (docID >= ucdv.length) {
+                ucdv = Arrays.copyOf(ucdv, (docID + 1) * 2);
+            }
+
+            if (ucdv[docID] == null) {
+                ucdv[docID] = new ArrayList<IndexEntry>();
+            }
+            ucdv[docID].add(uncasedSavedEntry);
         }
-        if(dv[docID] == null) {
-            dv[docID] = new ArrayList<IndexEntry>();
-        }
-        dv[docID].add(savedEntry);
+
     }
 
     /**
@@ -383,9 +387,9 @@
                     // in document ID order.
                     Occurrence uo = new OccurrenceImpl();
                     for(int i = 0; i < dv.length; i++) {
-                        if(dv[i] != null) {
+                        if(ucdv[i] != null) {
                             uo.setID(i);
-                            for(IndexEntry e : (List<IndexEntry>) dv[i]) {
+                            for(IndexEntry e : (List<IndexEntry>) ucdv[i]) {
                                 e.add(uo);
                             }
                         }
@@ -424,18 +428,20 @@
                 tbg.add(e.getName().toString(), e.getID());
             }
             header.tokenBGOffset = fieldDictFile.getFilePointer();
-            tbg.dump(path, new StringNameHandler(), fieldDictFile, postOut,
-                     MemoryDictionary.Renumber.RENUMBER,
-                     MemoryDictionary.IDMap.NONE,
-                     null);
+            tbg.dump(path, new StringNameHandler(),
+                    fieldDictFile, postOut,
+                    MemoryDictionary.Renumber.RENUMBER,
+                    MemoryDictionary.IDMap.NONE,
+                    null);
         } else {
             header.tokenBGOffset = -1;
         }
 
         if(field.info.getType() == FieldInfo.Type.STRING) {
             IndexEntry[] sortedSaved =
-                    sortedEntries[Type.UNCASED_SAVED.ordinal()] != null ? 
sortedEntries[Type.UNCASED_SAVED.
-                    ordinal()] : sortedEntries[Type.RAW_SAVED.ordinal()];
+                    sortedEntries[Type.UNCASED_SAVED.ordinal()] != null ? 
+                    sortedEntries[Type.UNCASED_SAVED.ordinal()] : 
+                    sortedEntries[Type.RAW_SAVED.ordinal()];
             if(sortedSaved != null) {
                 MemoryBiGramDictionary sbg = new MemoryBiGramDictionary(
                         new EntryFactory(Postings.Type.ID_FREQ));
@@ -443,10 +449,16 @@
                     sbg.add(e.getName().toString(), e.getID());
                 }
                 header.savedBGOffset = fieldDictFile.getFilePointer();
+//                Logger dl = 
Logger.getLogger(MemoryDictionary.class.getName());
+//                Logger wl = 
Logger.getLogger(DictionaryWriter.class.getName());
+//                dl.setLevel(Level.FINE);
+//                wl.setLevel(Level.FINE);
                 sbg.dump(path, new StringNameHandler(), fieldDictFile, 
postOut,
                          MemoryDictionary.Renumber.NONE,
                          MemoryDictionary.IDMap.NONE,
                          null);
+//                dl.setLevel(Level.INFO);
+//                wl.setLevel(Level.INFO);
             } else {
                 header.savedBGOffset = -1;
             }

diff -r 593b8eda9d7d -r 61833746ad29 
Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java
--- 
a/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java 
      Fri Mar 25 17:16:31 2011 -0400
+++ 
b/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java 
      Sun Apr 03 16:36:49 2011 -0700
@@ -30,7 +30,6 @@
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
-import com.sun.labs.minion.indexer.postings.io.PostingsOutput;
 
 import com.sun.labs.minion.retrieval.ArrayGroup;
 import com.sun.labs.minion.retrieval.ScoredGroup;

diff -r 593b8eda9d7d -r 61833746ad29 
Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
--- a/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java   
  Fri Mar 25 17:16:31 2011 -0400
+++ b/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java   
  Sun Apr 03 16:36:49 2011 -0700
@@ -714,6 +714,7 @@
         //
         // Bounds check.
         if(posn < 0 || posn >= dh.size) {
+            logger.info(String.format("posn: %d size: %d", posn, dh.size));
             return null;
         }
 
@@ -870,6 +871,8 @@
         //
         // First, get the matching entry IDs from the bigram dictionary.
         int[] entryIds = biDict.getMatching(pat);
+        
+        logger.info(String.format("candidates: %s", 
Arrays.toString(entryIds)));
 
         if(qtt.timedOut) {
             // Operation timed out
@@ -912,6 +915,7 @@
                     (!qtt.timedOut) &&
                     ((maxEntries <= 0) || (res.size() < maxEntries)); i++) {
                 QueryEntry curr = getByID(entryIds[i]);
+                logger.info(String.format("%d: %s", entryIds[i], curr));
                 if(Util.match(patArray, curr.toString().toCharArray(),
                               caseSensitive)) {
                     res.add(curr);

diff -r 593b8eda9d7d -r 61833746ad29 
Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryBiGramDictionary.java
--- 
a/Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryBiGramDictionary.java
     Fri Mar 25 17:16:31 2011 -0400
+++ 
b/Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryBiGramDictionary.java
     Sun Apr 03 16:36:49 2011 -0700
@@ -24,9 +24,7 @@
 
 package com.sun.labs.minion.indexer.dictionary;
 
-import com.sun.labs.minion.util.CharUtils;
 
-import com.sun.labs.minion.indexer.entry.Entry;
 import com.sun.labs.minion.indexer.entry.EntryFactory;
 import com.sun.labs.minion.indexer.entry.IndexEntry;
 
@@ -34,8 +32,6 @@
 
 public class MemoryBiGramDictionary extends MemoryDictionary {
     
-    protected static String logTag = "MBG";
-    
     /**
      * Creates a bigram dictionary.  Such a dictionary will be populated a
      * term at a time.

diff -r 593b8eda9d7d -r 61833746ad29 
Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryDictionary.java
--- a/Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryDictionary.java 
  Fri Mar 25 17:16:31 2011 -0400
+++ b/Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryDictionary.java 
  Sun Apr 03 16:36:49 2011 -0700
@@ -341,7 +341,7 @@
             IDMap idMapType,
             int[] postIDMap)
             throws java.io.IOException {
-        logger.fine("Dumping: " + map.size() + " entries");
+        logger.fine(String.format("Dumping %d entries", map.size()));
 
         //
         // Get a writer for the dictionary.  If we're not renumbering, it
@@ -373,7 +373,10 @@
             //
             // Write it out.
             if(entry.writePostings(postOut, postIDMap) == true) {
+                logger.fine(String.format("wrote postings for %s, %d entries 
in list", entry.getName(), entry.getN()));
                 dw.write(entry);
+            } else {
+                logger.fine(String.format("No postings for %s", 
entry.getName()));
             }
         }
 
@@ -387,6 +390,8 @@
         //
         // Write the final dictionary.
         dw.finish(dictFile);
+        
+        logger.fine(String.format("header:\n%s", dw.dh));
 
         return sorted;
     }

diff -r 593b8eda9d7d -r 61833746ad29 
Minion/src/com/sun/labs/minion/test/QueryTest.java
--- a/Minion/src/com/sun/labs/minion/test/QueryTest.java        Fri Mar 25 
17:16:31 2011 -0400
+++ b/Minion/src/com/sun/labs/minion/test/QueryTest.java        Sun Apr 03 
16:36:49 2011 -0700
@@ -725,10 +725,11 @@
                     for(DiskField df :
                             ((InvFileDiskPartition) p).getDiskFields()) {
                         output.format("Field: %s\n", df.getInfo().getName());
+                        output.flush();
                         List<QueryEntry> entries = df.getMatching(pat,
                                                                   
wildCaseSensitive,
                                                                   -1, -1);
-                        if(entries.size() == 0) {
+                        if(entries.isEmpty()) {
                             output.println("No matches");
                         } else {
                             output.format("%d matches\n", entries.size());


diff -r 61833746ad29 -r ae83b73f5591 
Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java
--- a/Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java  Sun 
Apr 03 16:36:49 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java  Sun 
Apr 03 17:41:22 2011 -0700
@@ -36,6 +36,7 @@
 import java.nio.channels.FileChannel;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 import java.util.PriorityQueue;
 import java.util.logging.Logger;
@@ -145,9 +146,9 @@
         if(header.tokenBGOffset > 0) {
             dictFile.seek(header.tokenBGOffset);
             DiskDictionary tokenDict = 
-                    dicts[Type.UNCASED_TOKENS.ordinal()] != null ?
-                    dicts[Type.UNCASED_TOKENS.ordinal()] :
-                    dicts[Type.CASED_TOKENS.ordinal()];
+                    dicts[Type.CASED_TOKENS.ordinal()] != null ?
+                    dicts[Type.CASED_TOKENS.ordinal()] :
+                    dicts[Type.UNCASED_TOKENS.ordinal()];
             tokenBigrams = new DiskBiGramDictionary(dictFile, postIn[0],
                                                     
DiskDictionary.PostingsInputType.FILE_PART_POST,
                                                     
DiskDictionary.BufferType.FILEBUFFER,
@@ -159,20 +160,12 @@
 
         if(header.savedBGOffset > 0) {
             dictFile.seek(header.savedBGOffset);
-            DiskDictionary savedValueDict = 
-                    dicts[Type.UNCASED_SAVED.ordinal()] != null ?
-                    dicts[Type.UNCASED_SAVED.ordinal()] :
-                    dicts[Type.RAW_SAVED.ordinal()];
-                    
             savedBigrams = new DiskBiGramDictionary(dictFile, postIn[0],
                                                     
DiskDictionary.PostingsInputType.FILE_PART_POST,
                                                     
DiskDictionary.BufferType.FILEBUFFER,
                                                     256, 2048, 2048, 2048, 
2048,
                                                     null,
-                                                    savedValueDict);
-            logger.info(String.format("field: %s", info.getName()));
-            logger.info(String.format("savedBG header: %s", 
savedBigrams.getHeader()));
-            logger.info(String.format("savedUN header: %s", 
savedValueDict.getHeader()));
+                                                    
dicts[Type.RAW_SAVED.ordinal()]);
             savedBigrams.setPartition(field.partition);
         }
 
@@ -285,7 +278,33 @@
         }
         return ret;
     }
+    
+    public List<QueryEntry> getWildcardMatches(String pat, boolean 
caseSensitive, 
+            int maxEntries, long timeLimit) {
+        
+        DiskDictionary tokenDict = 
+                dicts[Type.CASED_TOKENS.ordinal()] != null ?
+                dicts[Type.CASED_TOKENS.ordinal()] :
+                dicts[Type.UNCASED_TOKENS.ordinal()];
+        
+        if(tokenDict == null) {
+            return Collections.EMPTY_LIST;
+        }
+        
+        QueryEntry[] qes = tokenDict.getMatching(
+                tokenBigrams, pat,
+                caseSensitive,
+                maxEntries,
+                timeLimit);
 
+        if(qes == null) {
+            return Collections.EMPTY_LIST;
+        }
+        
+        return Arrays.asList(qes);
+        
+    }
+    
     public QueryEntry getStem(String stem) {
         QueryEntry ret = null;
         if(dicts[Type.STEMMED_TOKENS.ordinal()] != null) {
@@ -424,25 +443,11 @@
                     getName()));
             return null;
         }
-        QueryEntry[] qes;
-        if (caseSensitive) {
-            qes = dicts[Type.RAW_SAVED.ordinal()].getMatching(savedBigrams, 
val,
-                    true,
-                    maxEntries,
-                    timeLimit);
-        } else {
-            if (dicts[Type.UNCASED_SAVED.ordinal()] == null) {
-                logger.warning(String.format(
-                        "Can't get uncased matches for string field %s",
-                        info.getName()));
-                return null;
-            }
-            qes = 
dicts[Type.UNCASED_SAVED.ordinal()].getMatching(savedBigrams,
-                    val.toLowerCase(),
-                    false,
-                    maxEntries,
-                    timeLimit);
-        }
+        QueryEntry[] qes = dicts[Type.RAW_SAVED.ordinal()].getMatching(
+                savedBigrams, val,
+                caseSensitive,
+                maxEntries,
+                timeLimit);
         return new ArrayDictionaryIterator(qes);
     }
 
@@ -471,8 +476,10 @@
 
             DictionaryIterator di = getMatchingIterator(pattern, 
caseSensitive,
                                                         maxEntries, 
timeLimit);
-            while(di.hasNext()) {
-                ret.add((QueryEntry) di.next());
+            if (di != null) {
+                while (di.hasNext()) {
+                    ret.add((QueryEntry) di.next());
+                }
             }
         }
         return ret;

diff -r 61833746ad29 -r ae83b73f5591 
Minion/src/com/sun/labs/minion/indexer/DiskField.java
--- a/Minion/src/com/sun/labs/minion/indexer/DiskField.java     Sun Apr 03 
16:36:49 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/indexer/DiskField.java     Sun Apr 03 
17:41:22 2011 -0700
@@ -67,6 +67,13 @@
     public QueryEntry getTerm(int id, boolean caseSensitive) {
         return bundle.getTerm(id, caseSensitive);
     }
+    
+    public List<QueryEntry> getWildcardMatches(String name, boolean 
caseSensitive,
+            int maxEntries,
+            long timeLimit) {
+        return bundle.getWildcardMatches(name, caseSensitive, maxEntries,
+                timeLimit);
+    }
 
     public TermStatsImpl getTermStats(String name) {
         return partition.getPartitionManager().getTermStats(name, info);

diff -r 61833746ad29 -r ae83b73f5591 
Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java
--- a/Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java      
  Sun Apr 03 16:36:49 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java      
  Sun Apr 03 17:41:22 2011 -0700
@@ -417,15 +417,15 @@
         // If we have tokens or saved values, then output any bigrams that we
         // need for accelerating wildcards.
         IndexEntry[] sortedTokens = 
-                sortedEntries[Type.UNCASED_TOKENS.ordinal()] != null ?
-                sortedEntries[Type.UNCASED_TOKENS.ordinal()] :
-                sortedEntries[Type.CASED_TOKENS.ordinal()];
+                sortedEntries[Type.CASED_TOKENS.ordinal()] != null ?
+                sortedEntries[Type.CASED_TOKENS.ordinal()] :
+                sortedEntries[Type.UNCASED_TOKENS.ordinal()];
         
         if(sortedTokens != null) {
             MemoryBiGramDictionary tbg = new MemoryBiGramDictionary(
                     new EntryFactory(Postings.Type.ID_FREQ));
             for(IndexEntry e : sortedTokens) {
-                tbg.add(e.getName().toString(), e.getID());
+                tbg.add(CharUtils.toLowerCase(e.getName().toString()), 
e.getID());
             }
             header.tokenBGOffset = fieldDictFile.getFilePointer();
             tbg.dump(path, new StringNameHandler(),
@@ -438,30 +438,19 @@
         }
 
         if(field.info.getType() == FieldInfo.Type.STRING) {
-            IndexEntry[] sortedSaved =
-                    sortedEntries[Type.UNCASED_SAVED.ordinal()] != null ? 
-                    sortedEntries[Type.UNCASED_SAVED.ordinal()] : 
-                    sortedEntries[Type.RAW_SAVED.ordinal()];
-            if(sortedSaved != null) {
-                MemoryBiGramDictionary sbg = new MemoryBiGramDictionary(
-                        new EntryFactory(Postings.Type.ID_FREQ));
-                for(IndexEntry e : sortedSaved) {
-                    sbg.add(e.getName().toString(), e.getID());
-                }
-                header.savedBGOffset = fieldDictFile.getFilePointer();
-//                Logger dl = 
Logger.getLogger(MemoryDictionary.class.getName());
-//                Logger wl = 
Logger.getLogger(DictionaryWriter.class.getName());
-//                dl.setLevel(Level.FINE);
-//                wl.setLevel(Level.FINE);
-                sbg.dump(path, new StringNameHandler(), fieldDictFile, 
postOut,
-                         MemoryDictionary.Renumber.NONE,
-                         MemoryDictionary.IDMap.NONE,
-                         null);
-//                dl.setLevel(Level.INFO);
-//                wl.setLevel(Level.INFO);
-            } else {
-                header.savedBGOffset = -1;
+            MemoryBiGramDictionary sbg = new MemoryBiGramDictionary(
+                    new EntryFactory(Postings.Type.ID_FREQ));
+            for (IndexEntry e : sortedEntries[Type.RAW_SAVED.ordinal()]) {
+                sbg.add(CharUtils.toLowerCase(e.getName().toString()),
+                        e.getID());
             }
+            header.savedBGOffset = fieldDictFile.getFilePointer();
+            sbg.dump(path, new StringNameHandler(), fieldDictFile, postOut,
+                    MemoryDictionary.Renumber.NONE,
+                    MemoryDictionary.IDMap.NONE,
+                    null);
+        } else {
+            header.savedBGOffset = -1;
         }
 
       

diff -r 61833746ad29 -r ae83b73f5591 
Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java
--- 
a/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java 
      Sun Apr 03 16:36:49 2011 -0700
+++ 
b/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java 
      Sun Apr 03 17:41:22 2011 -0700
@@ -21,7 +21,6 @@
  * Park, CA 94025 or visit www.sun.com if you need additional
  * information or have any questions.
  */
-
 package com.sun.labs.minion.indexer.dictionary;
 
 import java.io.RandomAccessFile;
@@ -55,29 +54,29 @@
     private DiskDictionary mainDict;
 
     public DiskBiGramDictionary(RandomAccessFile dictFile,
-                                 RandomAccessFile postFile,
-                                 PostingsInputType postInType,
-                                 BufferType fileBufferType,
-                                 int cacheSize,
-                                 int nameBufferSize,
-                                 int offsetsBufferSize,
-                                 int infoBufferSize,
-                                 int infoOffsetsBufferSize,
-                                 Partition part,
-                                 DiskDictionary mainDict)
+            RandomAccessFile postFile,
+            PostingsInputType postInType,
+            BufferType fileBufferType,
+            int cacheSize,
+            int nameBufferSize,
+            int offsetsBufferSize,
+            int infoBufferSize,
+            int infoOffsetsBufferSize,
+            Partition part,
+            DiskDictionary mainDict)
             throws java.io.IOException {
         super(new EntryFactory(Type.ID_FREQ),
-              new StringNameHandler(),
-              dictFile,
-              new RandomAccessFile[]{postFile},
-              postInType,
-              fileBufferType,
-              cacheSize,
-              nameBufferSize,
-              offsetsBufferSize,
-              infoBufferSize,
-              infoOffsetsBufferSize,
-              part);
+                new StringNameHandler(),
+                dictFile,
+                new RandomAccessFile[]{postFile},
+                postInType,
+                fileBufferType,
+                cacheSize,
+                nameBufferSize,
+                offsetsBufferSize,
+                infoBufferSize,
+                infoOffsetsBufferSize,
+                part);
         this.mainDict = mainDict;
     } // DiskBiGramDictionary constructor
 
@@ -116,11 +115,11 @@
      * tested.
      */
     public int[] getMatching(String wc,
-                              boolean starts,
-                              boolean ends) {
+            boolean starts,
+            boolean ends) {
         //
         // Quick sanity check.
-        if(wc.length() == 0 || size() == 0) {
+        if (wc.length() == 0 || size() == 0) {
             return null;
         }
 
@@ -142,7 +141,7 @@
         // calculating bigrams will depend on whether the string has to be
         // anchored at one end of the pattern or another.
         int b, e;
-        if(starts) {
+        if (starts) {
             bg[0] = (char) 0;
             b = 0;
         } else {
@@ -151,20 +150,19 @@
         }
 
         int l = wc.length();
-        if(ends) {
+        if (ends) {
             e = l;
         } else {
             e = l - 1;
         }
 
-        for(int i = b; i <= e; i++) {
+        for (int i = b; i <= e; i++) {
 
             bg[1] = (ends && i == e) ? (char) 0 : wc.charAt(i);
 
             //
             // If there's any wildcard character, we can't have a bigram!
-            if(bg[0] == '*' || bg[0] == '?' ||
-                    bg[1] == '*' || bg[1] == '?') {
+            if (bg[0] == '*' || bg[0] == '?' || bg[1] == '*' || bg[1] == 
'?') {
                 bg[0] = bg[1];
                 continue;
             }
@@ -175,7 +173,7 @@
             // dictionary we're associated with won't contain any entry
             // whose name contains this bigram!
             Entry bigram = get(new String(bg));
-            if(bigram == null) {
+            if (bigram == null) {
                 return null;
             }
             bigrams.add(bigram);
@@ -188,7 +186,7 @@
 
         //
         // If we have bigrams, then make our ID list.
-        if(bigrams.size() > 0) {
+        if (bigrams.size() > 0) {
             ag = intersect(bigrams);
         } else {
 
@@ -196,14 +194,14 @@
             // Pull out any unigrams and use those.
             char[] ug = new char[wc.length()];
             int nu = 0;
-            for(int i = 0; i < wc.length(); i++) {
+            for (int i = 0; i < wc.length(); i++) {
                 char c = wc.charAt(i);
-                if(c != '*' && c != '?') {
+                if (c != '*' && c != '?') {
                     ug[nu++] = c;
                 }
             }
 
-            if(nu == 0) {
+            if (nu == 0) {
                 //
                 // No unigrams, no bigrams.  It's all wildcards, so we need 
to check
                 // everything.
@@ -217,7 +215,7 @@
             // all the occurrences of the unigram.  We'll union together
             // the postings for these and then intersect the resulting
             // unions.
-            for(int i = 0; i < nu; i++) {
+            for (int i = 0; i < nu; i++) {
 
                 //
                 // Get this character and the next greater one.
@@ -225,7 +223,7 @@
                 String upper = Character.toString((char) ((int) ug[i] + 1));
                 ArrayGroup curr = getUnigrams(lower, upper);
 
-                if(ag == null) {
+                if (ag == null) {
                     ag = curr;
                 } else {
                     ag = ag.intersect(curr);
@@ -233,7 +231,7 @@
 
                 //
                 // If we ever drop to zero size, we're done.
-                if(ag.getSize() == 0) {
+                if (ag.getSize() == 0) {
                     return null;
                 }
             }
@@ -243,7 +241,7 @@
 
         //
         // At this point, no hits means no matches.
-        if(ret.length == 0) {
+        if (ret.length == 0) {
             return null;
         }
         return ret;
@@ -264,7 +262,7 @@
     public int[] getAllVariants(String wc, boolean allowPartial) {
         //
         // Quick sanity check.
-        if(wc.length() == 0 || size() == 0) {
+        if (wc.length() == 0 || size() == 0) {
             return null;
         }
 
@@ -285,7 +283,7 @@
         // bigrams.
         bg[0] = wc.charAt(0);
 
-        for(int i = 1; i < wc.length(); i++) {
+        for (int i = 1; i < wc.length(); i++) {
 
             bg[1] = wc.charAt(i);
 
@@ -296,8 +294,8 @@
             // dictionary we're associated with won't contain any entry
             // whose name contains this bigram!
             Entry bigram = get(new String(bg));
-            if(bigram == null) {
-                if(allowPartial) {
+            if (bigram == null) {
+                if (allowPartial) {
                     bg[0] = bg[1];
                     continue;
                 } else {
@@ -314,7 +312,7 @@
 
         //
         // If we have bigrams, then make our ID list.
-        if(bigrams.size() > 0) {
+        if (bigrams.size() > 0) {
             ag = union(bigrams);
         } else {
 
@@ -324,7 +322,7 @@
         }
 
         int[] ret = ag.getDocs();
-        if(ret.length == 0) {
+        if (ret.length == 0) {
             return null;
         }
         return ret;
@@ -343,9 +341,9 @@
         Collections.sort(entries, new EntrySizeComparator());
         ArrayGroup ag = null;
         PostingsIteratorFeatures feat = new PostingsIteratorFeatures();
-        for(Iterator i = entries.iterator(); i.hasNext();) {
+        for (Iterator i = entries.iterator(); i.hasNext();) {
             QueryEntry e = (QueryEntry) i.next();
-            if(ag == null) {
+            if (ag == null) {
                 ag = new ArrayGroup(e.iterator(feat));
             } else {
                 ag = ag.destructiveIntersect(e.iterator(feat));
@@ -366,14 +364,14 @@
         // Throw all the entries into a ScoredQuickOr, possibly tossing
         // out entries below a particular score?
         ScoredQuickOr qor = new ScoredQuickOr((DiskPartition) part,
-                                              mainDict.getMaxID());
+                mainDict.getMaxID());
         PostingsIteratorFeatures feat = new PostingsIteratorFeatures();
-        for(Iterator it = entries.iterator(); it.hasNext();) {
+        for (Iterator it = entries.iterator(); it.hasNext();) {
             QueryEntry e = (QueryEntry) it.next();
             qor.add(e.iterator(feat));
         }
         ScoredGroup sg = (ScoredGroup) qor.getGroup();
-        if(sg.getSize() >= 1000) {
+        if (sg.getSize() >= 1000) {
             sg.sort(true);
             float nthScore = sg.getScore(1000);
             sg.discardBelow(nthScore);
@@ -386,27 +384,27 @@
      * the bigrams that have that character as a first character.
      */
     protected ArrayGroup getUnigrams(String lower,
-                                      String upper) {
+            String upper) {
         DictionaryIterator di = iterator(lower, true,
-                                         upper, false);
+                upper, false);
 
         int[] ids = new int[di.estimateSize()];
         int p = 0;
         PostingsIteratorFeatures feat = new PostingsIteratorFeatures();
-        while(di.hasNext()) {
+        while (di.hasNext()) {
             PostingsIterator pi = ((QueryEntry) di.next()).iterator(feat);
-            if(pi == null) {
+            if (pi == null) {
                 continue;
             }
-            while(pi.next()) {
+            while (pi.next()) {
                 ids[p++] = pi.getID();
             }
         }
         java.util.Arrays.sort(ids, 0, p);
         int size = 0;
         int prev = -1;
-        for(int i = 0; i < p; i++) {
-            if(ids[i] != prev) {
+        for (int i = 0; i < p; i++) {
+            if (ids[i] != prev) {
                 ids[size++] = ids[i];
             }
             prev = ids[i];
@@ -414,10 +412,10 @@
         return new ArrayGroup(ids, size);
     }
 
-    public static void merge(MergeState mergeState, 
-            DiskBiGramDictionary[] dicts ) throws java.io.IOException {
+    public static void merge(MergeState mergeState,
+            DiskBiGramDictionary[] dicts) throws java.io.IOException {
         DiskDictionary.merge(
-                mergeState.manager.getIndexDir(), 
+                mergeState.manager.getIndexDir(),
                 new StringNameHandler(),
                 (DiskDictionary[]) dicts,
                 null,

diff -r 61833746ad29 -r ae83b73f5591 Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
--- a/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java     Sun Apr 03 16:36:49 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java     Sun Apr 03 17:41:22 2011 -0700
@@ -714,7 +714,9 @@
         //
         // Bounds check.
         if(posn < 0 || posn >= dh.size) {
-            logger.info(String.format("posn: %d size: %d", posn, dh.size));
+            if(posn < 0) {
+            logger.log(Level.INFO, String.format("posn: %d size: %d", posn, dh.size), new Exception("Negative posn?"));
+            }
             return null;
         }
 
@@ -870,16 +872,14 @@
 
         //
         // First, get the matching entry IDs from the bigram dictionary.
-        int[] entryIds = biDict.getMatching(pat);
+        int[] candidateEntryIDs = biDict.getMatching(pat);
         
-        logger.info(String.format("candidates: %s", Arrays.toString(entryIds)));
-
         if(qtt.timedOut) {
             // Operation timed out
             return new QueryEntry[0];
         }
 
-        if(entryIds == null) {
+        if(candidateEntryIDs == null) {
             // There's nothing that could match
             return null;
         }
@@ -895,7 +895,7 @@
 
         //
         // Now check the entry IDs.
-        if(entryIds.length == 0) {
+        if(candidateEntryIDs.length == 0) {
             // There's no bigrams or unigrams to narrow down by.
             // Try everything:
             Iterator entryIt = iterator();
@@ -911,11 +911,10 @@
             //
             // Now look up each entry and see if it matches the
             // result.
-            for(int i = 0; (i < entryIds.length) && (entryIds[i] != 0) &&
+            for(int i = 0; (i < candidateEntryIDs.length) && (candidateEntryIDs[i] != 0) &&
                     (!qtt.timedOut) &&
                     ((maxEntries <= 0) || (res.size() < maxEntries)); i++) {
-                QueryEntry curr = getByID(entryIds[i]);
-                logger.info(String.format("%d: %s", entryIds[i], curr));
+                QueryEntry curr = getByID(candidateEntryIDs[i]);
                 if(Util.match(patArray, curr.toString().toCharArray(),
                               caseSensitive)) {
                     res.add(curr);

diff -r 61833746ad29 -r ae83b73f5591 Minion/src/com/sun/labs/minion/indexer/partition/InvFileDiskPartition.java
--- a/Minion/src/com/sun/labs/minion/indexer/partition/InvFileDiskPartition.java        Sun Apr 03 16:36:49 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/indexer/partition/InvFileDiskPartition.java        Sun Apr 03 17:41:22 2011 -0700
@@ -239,7 +239,7 @@
             return f.fetchOne(docID);
         }
     }
-
+    
     /**
      * Gets an iterator for all of the values in a field.
      *

diff -r 61833746ad29 -r ae83b73f5591 Minion/src/com/sun/labs/minion/test/QueryTest.java
--- a/Minion/src/com/sun/labs/minion/test/QueryTest.java        Sun Apr 03 16:36:49 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/test/QueryTest.java        Sun Apr 03 17:41:22 2011 -0700
@@ -686,7 +686,7 @@
                 output.println("model:" + currModel.toString());
                 String[] labels = currModel.getSenseLabels();
                 for(String l : labels) {
-                    logger.info("label: " + l);
+                    logger.info(String.format("label: %s", l));
                 }
             } catch(Exception e) {
                 System.err.println("Exception during disambiguation");
@@ -724,6 +724,41 @@
                 for(DiskPartition p : manager.getActivePartitions()) {
                     for(DiskField df :
                             ((InvFileDiskPartition) p).getDiskFields()) {
+                        
+                        if(!df.getInfo().hasAttribute(
+                                FieldInfo.Attribute.TOKENIZED)) {
+                            continue;
+                        }
+                        output.format("Field: %s\n", df.getInfo().getName());
+                        List<QueryEntry> entries = df.getWildcardMatches(pat,
+                                                                  wildCaseSensitive,
+                                                                  -1, -1);
+                        if(entries.isEmpty()) {
+                            output.println("No matches");
+                        } else {
+                            output.format("%d token matches\n", entries.size());
+                            for(QueryEntry e : entries) {
+                                output.format(" %s (%d)\n", e.getName(),
+                                              e.getN());
+                            }
+                        }
+                    }
+                }
+
+            } catch(Exception e) {
+                logger.log(Level.SEVERE, "Exception in :wild", e);
+                return 0;
+            }
+        } else if(q.startsWith(":fwild ")) {
+
+            try {
+                String pat = q.substring(q.indexOf(' ') + 1).trim();
+                for(DiskPartition p : manager.getActivePartitions()) {
+                    for(DiskField df :
+                            ((InvFileDiskPartition) p).getDiskFields()) {
+                        if(df.getInfo().getType() !=  FieldInfo.Type.STRING) {
+                            continue;
+                        }
                         output.format("Field: %s\n", df.getInfo().getName());
                         output.flush();
                         List<QueryEntry> entries = df.getMatching(pat,
@@ -732,9 +767,8 @@
                         if(entries.isEmpty()) {
                             output.println("No matches");
                         } else {
-                            output.format("%d matches\n", entries.size());
+                            output.format("%d saved value matches\n", entries.size());
                             for(QueryEntry e : entries) {
-                                output.println("");
                                 output.format(" %s (%d)\n", e.getName(),
                                               e.getN());
                             }


diff -r ae83b73f5591 -r 5b1430509004 Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java
--- a/Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java  Sun Apr 03 17:41:22 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java  Sun Apr 03 17:47:18 2011 -0700
@@ -39,6 +39,7 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.PriorityQueue;
+import java.util.logging.Level;
 import java.util.logging.Logger;
 
 /**
@@ -443,6 +444,7 @@
                     getName()));
             return null;
         }
+
         QueryEntry[] qes = dicts[Type.RAW_SAVED.ordinal()].getMatching(
                 savedBigrams, val,
                 caseSensitive,

diff -r ae83b73f5591 -r 5b1430509004 Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
--- a/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java     Sun Apr 03 17:41:22 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java     Sun Apr 03 17:47:18 2011 -0700
@@ -714,9 +714,6 @@
         //
         // Bounds check.
         if(posn < 0 || posn >= dh.size) {
-            if(posn < 0) {
-            logger.log(Level.INFO, String.format("posn: %d size: %d", posn, dh.size), new Exception("Negative posn?"));
-            }
             return null;
         }
 






[minion~hg:88] It's ok if we get a negative position while searching.

stgreen 04/28/2011
  • Mysql
  • Glassfish
  • Jruby
  • Rails
  • Nblogo
Terms of Use; Privacy Policy;
© 2010, Oracle Corporation and/or its affiliates
(revision 20120518.3c65429)
 
 
Close
loading
Please Confirm
Close