[minion~hg:88] It's ok if we get a negative position while searching.
- From: stgreen@kenai.com
- To: commits@minion.kenai.com
- Subject: [minion~hg:88] It's ok if we get a negative position while searching.
- Date: Thu, 28 Apr 2011 19:30:58 +0000
Project: minion
Repository: hg
Revision: 88
Author: stgreen
Date: 2011-04-04 00:47:18 UTC
Link:
Log Message:
------------
Getting bigram dictionaries working for wildcard matches against saved fields.
Be more sane about what gets bigrammified.
It's ok if we get a negative position while searching.
Revisions:
----------
86
87
88
Modified Paths:
---------------
Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java
Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java
Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java
Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryBiGramDictionary.java
Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryDictionary.java
Minion/src/com/sun/labs/minion/test/QueryTest.java
Minion/src/com/sun/labs/minion/indexer/DiskField.java
Minion/src/com/sun/labs/minion/indexer/partition/InvFileDiskPartition.java
Diffs:
------
diff -r 593b8eda9d7d -r 61833746ad29
Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java
--- a/Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java Fri
Mar 25 17:16:31 2011 -0400
+++ b/Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java Sun
Apr 03 16:36:49 2011 -0700
@@ -144,25 +144,35 @@
if(header.tokenBGOffset > 0) {
dictFile.seek(header.tokenBGOffset);
+ DiskDictionary tokenDict =
+ dicts[Type.UNCASED_TOKENS.ordinal()] != null ?
+ dicts[Type.UNCASED_TOKENS.ordinal()] :
+ dicts[Type.CASED_TOKENS.ordinal()];
tokenBigrams = new DiskBiGramDictionary(dictFile, postIn[0],
DiskDictionary.PostingsInputType.FILE_PART_POST,
DiskDictionary.BufferType.FILEBUFFER,
256, 2048, 2048, 2048,
2048,
null,
-
dicts[Type.UNCASED_TOKENS.
- ordinal()]);
+ tokenDict);
tokenBigrams.setPartition(field.partition);
}
if(header.savedBGOffset > 0) {
dictFile.seek(header.savedBGOffset);
+ DiskDictionary savedValueDict =
+ dicts[Type.UNCASED_SAVED.ordinal()] != null ?
+ dicts[Type.UNCASED_SAVED.ordinal()] :
+ dicts[Type.RAW_SAVED.ordinal()];
+
savedBigrams = new DiskBiGramDictionary(dictFile, postIn[0],
DiskDictionary.PostingsInputType.FILE_PART_POST,
DiskDictionary.BufferType.FILEBUFFER,
256, 2048, 2048, 2048,
2048,
null,
- dicts[Type.UNCASED_SAVED.
- ordinal()]);
+ savedValueDict);
+ logger.info(String.format("field: %s", info.getName()));
+ logger.info(String.format("savedBG header: %s",
savedBigrams.getHeader()));
+ logger.info(String.format("savedUN header: %s",
savedValueDict.getHeader()));
savedBigrams.setPartition(field.partition);
}
@@ -415,23 +425,23 @@
return null;
}
QueryEntry[] qes;
- if(caseSensitive) {
+ if (caseSensitive) {
qes = dicts[Type.RAW_SAVED.ordinal()].getMatching(savedBigrams,
val,
- true,
- maxEntries,
- timeLimit);
+ true,
+ maxEntries,
+ timeLimit);
} else {
- if(dicts[Type.UNCASED_SAVED.ordinal()] == null) {
+ if (dicts[Type.UNCASED_SAVED.ordinal()] == null) {
logger.warning(String.format(
"Can't get uncased matches for string field %s",
info.getName()));
return null;
}
- qes =
dicts[Type.UNCASED_SAVED.ordinal()].getMatching(savedBigrams, val.
- toLowerCase(),
- false,
- maxEntries,
- timeLimit);
+ qes =
dicts[Type.UNCASED_SAVED.ordinal()].getMatching(savedBigrams,
+ val.toLowerCase(),
+ false,
+ maxEntries,
+ timeLimit);
}
return new ArrayDictionaryIterator(qes);
}
diff -r 593b8eda9d7d -r 61833746ad29
Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java
--- a/Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java
Fri Mar 25 17:16:31 2011 -0400
+++ b/Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java
Sun Apr 03 16:36:49 2011 -0700
@@ -3,6 +3,7 @@
import com.sun.labs.minion.FieldInfo;
import com.sun.labs.minion.indexer.dictionary.DateNameHandler;
+import com.sun.labs.minion.indexer.dictionary.DictionaryWriter;
import com.sun.labs.minion.indexer.dictionary.DoubleNameHandler;
import com.sun.labs.minion.indexer.dictionary.LongNameHandler;
import com.sun.labs.minion.indexer.dictionary.MemoryBiGramDictionary;
@@ -30,6 +31,7 @@
import java.util.Arrays;
import java.util.Date;
import java.util.List;
+import java.util.logging.Level;
import java.util.logging.Logger;
/**
@@ -116,7 +118,9 @@
/**
* An array of the sets of entries saved per document at indexing time.
*/
- private List[] dv = new List[128];
+ private List[] dv;
+
+ private List[] ucdv;
private EntryFactory vectorEntryFactory = new
EntryFactory(Postings.Type.ID_FREQ);
@@ -152,10 +156,12 @@
}
if(field.saved) {
+ dv = new List[128];
dicts[Type.RAW_SAVED.ordinal()] = new
MemoryDictionary<N>(savedEntryFactory);
if(field.uncased) {
dicts[Type.UNCASED_SAVED.ordinal()] = new
MemoryDictionary<N>(
savedEntryFactory);
+ ucdv = new List[128];
}
}
@@ -260,40 +266,38 @@
return;
}
- IndexEntry savedEntry = null;
-
- if(info.getType() == FieldInfo.Type.STRING) {
- if((!field.uncased && !field.cased) || field.cased) {
- savedEntry = dicts[Type.RAW_SAVED.ordinal()].put(name);
+ //
+ // We'll just store the saved entries per-document, since we might
be adding
+ // entries in non-document ID order (e.g., when doing
classification.)
+ // We'll build the actual postings lists at dump time.
+ if (dicts[Type.RAW_SAVED.ordinal()] != null) {
+ IndexEntry rawSavedEntry =
dicts[Type.RAW_SAVED.ordinal()].put(name);
+ if (docID >= dv.length) {
+ dv = Arrays.copyOf(dv, (docID + 1) * 2);
}
- if(field.uncased) {
- IndexEntry uce =
dicts[Type.UNCASED_SAVED.ordinal()].put(CharUtils.
- toLowerCase(
- name.toString()));
-
- //
- // If there was no cased version saved, we'll keep the
uncased version.
- if(savedEntry == null) {
- savedEntry = uce;
- }
+ if (dv[docID] == null) {
+ dv[docID] = new ArrayList<IndexEntry>();
}
-
- } else {
- savedEntry = dicts[Type.RAW_SAVED.ordinal()].put(name);
+ dv[docID].add(rawSavedEntry);
}
//
- // We'll just store the entries per-document, since we might be
adding
- // entries in non-document ID order. We'll build the actual postings
- // lists at dump time.
- if(docID >= dv.length) {
- dv = Arrays.copyOf(dv, (docID+1) * 2);
+ // Handle the uncased values for string fields.
+ if (dicts[Type.UNCASED_SAVED.ordinal()] != null) {
+ IndexEntry uncasedSavedEntry =
dicts[Type.UNCASED_SAVED.ordinal()].
+ put(CharUtils.toLowerCase(
+ name.toString()));
+ if (docID >= ucdv.length) {
+ ucdv = Arrays.copyOf(ucdv, (docID + 1) * 2);
+ }
+
+ if (ucdv[docID] == null) {
+ ucdv[docID] = new ArrayList<IndexEntry>();
+ }
+ ucdv[docID].add(uncasedSavedEntry);
}
- if(dv[docID] == null) {
- dv[docID] = new ArrayList<IndexEntry>();
- }
- dv[docID].add(savedEntry);
+
}
/**
@@ -383,9 +387,9 @@
// in document ID order.
Occurrence uo = new OccurrenceImpl();
for(int i = 0; i < dv.length; i++) {
- if(dv[i] != null) {
+ if(ucdv[i] != null) {
uo.setID(i);
- for(IndexEntry e : (List<IndexEntry>) dv[i]) {
+ for(IndexEntry e : (List<IndexEntry>) ucdv[i]) {
e.add(uo);
}
}
@@ -424,18 +428,20 @@
tbg.add(e.getName().toString(), e.getID());
}
header.tokenBGOffset = fieldDictFile.getFilePointer();
- tbg.dump(path, new StringNameHandler(), fieldDictFile, postOut,
- MemoryDictionary.Renumber.RENUMBER,
- MemoryDictionary.IDMap.NONE,
- null);
+ tbg.dump(path, new StringNameHandler(),
+ fieldDictFile, postOut,
+ MemoryDictionary.Renumber.RENUMBER,
+ MemoryDictionary.IDMap.NONE,
+ null);
} else {
header.tokenBGOffset = -1;
}
if(field.info.getType() == FieldInfo.Type.STRING) {
IndexEntry[] sortedSaved =
- sortedEntries[Type.UNCASED_SAVED.ordinal()] != null ?
sortedEntries[Type.UNCASED_SAVED.
- ordinal()] : sortedEntries[Type.RAW_SAVED.ordinal()];
+ sortedEntries[Type.UNCASED_SAVED.ordinal()] != null ?
+ sortedEntries[Type.UNCASED_SAVED.ordinal()] :
+ sortedEntries[Type.RAW_SAVED.ordinal()];
if(sortedSaved != null) {
MemoryBiGramDictionary sbg = new MemoryBiGramDictionary(
new EntryFactory(Postings.Type.ID_FREQ));
@@ -443,10 +449,16 @@
sbg.add(e.getName().toString(), e.getID());
}
header.savedBGOffset = fieldDictFile.getFilePointer();
+// Logger dl =
Logger.getLogger(MemoryDictionary.class.getName());
+// Logger wl =
Logger.getLogger(DictionaryWriter.class.getName());
+// dl.setLevel(Level.FINE);
+// wl.setLevel(Level.FINE);
sbg.dump(path, new StringNameHandler(), fieldDictFile,
postOut,
MemoryDictionary.Renumber.NONE,
MemoryDictionary.IDMap.NONE,
null);
+// dl.setLevel(Level.INFO);
+// wl.setLevel(Level.INFO);
} else {
header.savedBGOffset = -1;
}
diff -r 593b8eda9d7d -r 61833746ad29
Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java
---
a/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java
Fri Mar 25 17:16:31 2011 -0400
+++
b/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java
Sun Apr 03 16:36:49 2011 -0700
@@ -30,7 +30,6 @@
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
-import com.sun.labs.minion.indexer.postings.io.PostingsOutput;
import com.sun.labs.minion.retrieval.ArrayGroup;
import com.sun.labs.minion.retrieval.ScoredGroup;
diff -r 593b8eda9d7d -r 61833746ad29
Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
--- a/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
Fri Mar 25 17:16:31 2011 -0400
+++ b/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
Sun Apr 03 16:36:49 2011 -0700
@@ -714,6 +714,7 @@
//
// Bounds check.
if(posn < 0 || posn >= dh.size) {
+ logger.info(String.format("posn: %d size: %d", posn, dh.size));
return null;
}
@@ -870,6 +871,8 @@
//
// First, get the matching entry IDs from the bigram dictionary.
int[] entryIds = biDict.getMatching(pat);
+
+ logger.info(String.format("candidates: %s",
Arrays.toString(entryIds)));
if(qtt.timedOut) {
// Operation timed out
@@ -912,6 +915,7 @@
(!qtt.timedOut) &&
((maxEntries <= 0) || (res.size() < maxEntries)); i++) {
QueryEntry curr = getByID(entryIds[i]);
+ logger.info(String.format("%d: %s", entryIds[i], curr));
if(Util.match(patArray, curr.toString().toCharArray(),
caseSensitive)) {
res.add(curr);
diff -r 593b8eda9d7d -r 61833746ad29
Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryBiGramDictionary.java
---
a/Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryBiGramDictionary.java
Fri Mar 25 17:16:31 2011 -0400
+++
b/Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryBiGramDictionary.java
Sun Apr 03 16:36:49 2011 -0700
@@ -24,9 +24,7 @@
package com.sun.labs.minion.indexer.dictionary;
-import com.sun.labs.minion.util.CharUtils;
-import com.sun.labs.minion.indexer.entry.Entry;
import com.sun.labs.minion.indexer.entry.EntryFactory;
import com.sun.labs.minion.indexer.entry.IndexEntry;
@@ -34,8 +32,6 @@
public class MemoryBiGramDictionary extends MemoryDictionary {
- protected static String logTag = "MBG";
-
/**
* Creates a bigram dictionary. Such a dictionary will be populated a
* term at a time.
diff -r 593b8eda9d7d -r 61833746ad29
Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryDictionary.java
--- a/Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryDictionary.java
Fri Mar 25 17:16:31 2011 -0400
+++ b/Minion/src/com/sun/labs/minion/indexer/dictionary/MemoryDictionary.java
Sun Apr 03 16:36:49 2011 -0700
@@ -341,7 +341,7 @@
IDMap idMapType,
int[] postIDMap)
throws java.io.IOException {
- logger.fine("Dumping: " + map.size() + " entries");
+ logger.fine(String.format("Dumping %d entries", map.size()));
//
// Get a writer for the dictionary. If we're not renumbering, it
@@ -373,7 +373,10 @@
//
// Write it out.
if(entry.writePostings(postOut, postIDMap) == true) {
+ logger.fine(String.format("wrote postings for %s, %d entries
in list", entry.getName(), entry.getN()));
dw.write(entry);
+ } else {
+ logger.fine(String.format("No postings for %s",
entry.getName()));
}
}
@@ -387,6 +390,8 @@
//
// Write the final dictionary.
dw.finish(dictFile);
+
+ logger.fine(String.format("header:\n%s", dw.dh));
return sorted;
}
diff -r 593b8eda9d7d -r 61833746ad29
Minion/src/com/sun/labs/minion/test/QueryTest.java
--- a/Minion/src/com/sun/labs/minion/test/QueryTest.java Fri Mar 25
17:16:31 2011 -0400
+++ b/Minion/src/com/sun/labs/minion/test/QueryTest.java Sun Apr 03
16:36:49 2011 -0700
@@ -725,10 +725,11 @@
for(DiskField df :
((InvFileDiskPartition) p).getDiskFields()) {
output.format("Field: %s\n", df.getInfo().getName());
+ output.flush();
List<QueryEntry> entries = df.getMatching(pat,
wildCaseSensitive,
-1, -1);
- if(entries.size() == 0) {
+ if(entries.isEmpty()) {
output.println("No matches");
} else {
output.format("%d matches\n", entries.size());
diff -r 61833746ad29 -r ae83b73f5591
Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java
--- a/Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java Sun
Apr 03 16:36:49 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java Sun
Apr 03 17:41:22 2011 -0700
@@ -36,6 +36,7 @@
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.List;
import java.util.PriorityQueue;
import java.util.logging.Logger;
@@ -145,9 +146,9 @@
if(header.tokenBGOffset > 0) {
dictFile.seek(header.tokenBGOffset);
DiskDictionary tokenDict =
- dicts[Type.UNCASED_TOKENS.ordinal()] != null ?
- dicts[Type.UNCASED_TOKENS.ordinal()] :
- dicts[Type.CASED_TOKENS.ordinal()];
+ dicts[Type.CASED_TOKENS.ordinal()] != null ?
+ dicts[Type.CASED_TOKENS.ordinal()] :
+ dicts[Type.UNCASED_TOKENS.ordinal()];
tokenBigrams = new DiskBiGramDictionary(dictFile, postIn[0],
DiskDictionary.PostingsInputType.FILE_PART_POST,
DiskDictionary.BufferType.FILEBUFFER,
@@ -159,20 +160,12 @@
if(header.savedBGOffset > 0) {
dictFile.seek(header.savedBGOffset);
- DiskDictionary savedValueDict =
- dicts[Type.UNCASED_SAVED.ordinal()] != null ?
- dicts[Type.UNCASED_SAVED.ordinal()] :
- dicts[Type.RAW_SAVED.ordinal()];
-
savedBigrams = new DiskBiGramDictionary(dictFile, postIn[0],
DiskDictionary.PostingsInputType.FILE_PART_POST,
DiskDictionary.BufferType.FILEBUFFER,
256, 2048, 2048, 2048,
2048,
null,
- savedValueDict);
- logger.info(String.format("field: %s", info.getName()));
- logger.info(String.format("savedBG header: %s",
savedBigrams.getHeader()));
- logger.info(String.format("savedUN header: %s",
savedValueDict.getHeader()));
+
dicts[Type.RAW_SAVED.ordinal()]);
savedBigrams.setPartition(field.partition);
}
@@ -285,7 +278,33 @@
}
return ret;
}
+
+ public List<QueryEntry> getWildcardMatches(String pat, boolean
caseSensitive,
+ int maxEntries, long timeLimit) {
+
+ DiskDictionary tokenDict =
+ dicts[Type.CASED_TOKENS.ordinal()] != null ?
+ dicts[Type.CASED_TOKENS.ordinal()] :
+ dicts[Type.UNCASED_TOKENS.ordinal()];
+
+ if(tokenDict == null) {
+ return Collections.EMPTY_LIST;
+ }
+
+ QueryEntry[] qes = tokenDict.getMatching(
+ tokenBigrams, pat,
+ caseSensitive,
+ maxEntries,
+ timeLimit);
+ if(qes == null) {
+ return Collections.EMPTY_LIST;
+ }
+
+ return Arrays.asList(qes);
+
+ }
+
public QueryEntry getStem(String stem) {
QueryEntry ret = null;
if(dicts[Type.STEMMED_TOKENS.ordinal()] != null) {
@@ -424,25 +443,11 @@
getName()));
return null;
}
- QueryEntry[] qes;
- if (caseSensitive) {
- qes = dicts[Type.RAW_SAVED.ordinal()].getMatching(savedBigrams,
val,
- true,
- maxEntries,
- timeLimit);
- } else {
- if (dicts[Type.UNCASED_SAVED.ordinal()] == null) {
- logger.warning(String.format(
- "Can't get uncased matches for string field %s",
- info.getName()));
- return null;
- }
- qes =
dicts[Type.UNCASED_SAVED.ordinal()].getMatching(savedBigrams,
- val.toLowerCase(),
- false,
- maxEntries,
- timeLimit);
- }
+ QueryEntry[] qes = dicts[Type.RAW_SAVED.ordinal()].getMatching(
+ savedBigrams, val,
+ caseSensitive,
+ maxEntries,
+ timeLimit);
return new ArrayDictionaryIterator(qes);
}
@@ -471,8 +476,10 @@
DictionaryIterator di = getMatchingIterator(pattern,
caseSensitive,
maxEntries,
timeLimit);
- while(di.hasNext()) {
- ret.add((QueryEntry) di.next());
+ if (di != null) {
+ while (di.hasNext()) {
+ ret.add((QueryEntry) di.next());
+ }
}
}
return ret;
diff -r 61833746ad29 -r ae83b73f5591
Minion/src/com/sun/labs/minion/indexer/DiskField.java
--- a/Minion/src/com/sun/labs/minion/indexer/DiskField.java Sun Apr 03
16:36:49 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/indexer/DiskField.java Sun Apr 03
17:41:22 2011 -0700
@@ -67,6 +67,13 @@
public QueryEntry getTerm(int id, boolean caseSensitive) {
return bundle.getTerm(id, caseSensitive);
}
+
+ public List<QueryEntry> getWildcardMatches(String name, boolean
caseSensitive,
+ int maxEntries,
+ long timeLimit) {
+ return bundle.getWildcardMatches(name, caseSensitive, maxEntries,
+ timeLimit);
+ }
public TermStatsImpl getTermStats(String name) {
return partition.getPartitionManager().getTermStats(name, info);
diff -r 61833746ad29 -r ae83b73f5591
Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java
--- a/Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java
Sun Apr 03 16:36:49 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/indexer/MemoryDictionaryBundle.java
Sun Apr 03 17:41:22 2011 -0700
@@ -417,15 +417,15 @@
// If we have tokens or saved values, then output any bigrams that we
// need for accelerating wildcards.
IndexEntry[] sortedTokens =
- sortedEntries[Type.UNCASED_TOKENS.ordinal()] != null ?
- sortedEntries[Type.UNCASED_TOKENS.ordinal()] :
- sortedEntries[Type.CASED_TOKENS.ordinal()];
+ sortedEntries[Type.CASED_TOKENS.ordinal()] != null ?
+ sortedEntries[Type.CASED_TOKENS.ordinal()] :
+ sortedEntries[Type.UNCASED_TOKENS.ordinal()];
if(sortedTokens != null) {
MemoryBiGramDictionary tbg = new MemoryBiGramDictionary(
new EntryFactory(Postings.Type.ID_FREQ));
for(IndexEntry e : sortedTokens) {
- tbg.add(e.getName().toString(), e.getID());
+ tbg.add(CharUtils.toLowerCase(e.getName().toString()),
e.getID());
}
header.tokenBGOffset = fieldDictFile.getFilePointer();
tbg.dump(path, new StringNameHandler(),
@@ -438,30 +438,19 @@
}
if(field.info.getType() == FieldInfo.Type.STRING) {
- IndexEntry[] sortedSaved =
- sortedEntries[Type.UNCASED_SAVED.ordinal()] != null ?
- sortedEntries[Type.UNCASED_SAVED.ordinal()] :
- sortedEntries[Type.RAW_SAVED.ordinal()];
- if(sortedSaved != null) {
- MemoryBiGramDictionary sbg = new MemoryBiGramDictionary(
- new EntryFactory(Postings.Type.ID_FREQ));
- for(IndexEntry e : sortedSaved) {
- sbg.add(e.getName().toString(), e.getID());
- }
- header.savedBGOffset = fieldDictFile.getFilePointer();
-// Logger dl =
Logger.getLogger(MemoryDictionary.class.getName());
-// Logger wl =
Logger.getLogger(DictionaryWriter.class.getName());
-// dl.setLevel(Level.FINE);
-// wl.setLevel(Level.FINE);
- sbg.dump(path, new StringNameHandler(), fieldDictFile,
postOut,
- MemoryDictionary.Renumber.NONE,
- MemoryDictionary.IDMap.NONE,
- null);
-// dl.setLevel(Level.INFO);
-// wl.setLevel(Level.INFO);
- } else {
- header.savedBGOffset = -1;
+ MemoryBiGramDictionary sbg = new MemoryBiGramDictionary(
+ new EntryFactory(Postings.Type.ID_FREQ));
+ for (IndexEntry e : sortedEntries[Type.RAW_SAVED.ordinal()]) {
+ sbg.add(CharUtils.toLowerCase(e.getName().toString()),
+ e.getID());
}
+ header.savedBGOffset = fieldDictFile.getFilePointer();
+ sbg.dump(path, new StringNameHandler(), fieldDictFile, postOut,
+ MemoryDictionary.Renumber.NONE,
+ MemoryDictionary.IDMap.NONE,
+ null);
+ } else {
+ header.savedBGOffset = -1;
}
diff -r 61833746ad29 -r ae83b73f5591
Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java
---
a/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java
Sun Apr 03 16:36:49 2011 -0700
+++
b/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskBiGramDictionary.java
Sun Apr 03 17:41:22 2011 -0700
@@ -21,7 +21,6 @@
* Park, CA 94025 or visit www.sun.com if you need additional
* information or have any questions.
*/
-
package com.sun.labs.minion.indexer.dictionary;
import java.io.RandomAccessFile;
@@ -55,29 +54,29 @@
private DiskDictionary mainDict;
public DiskBiGramDictionary(RandomAccessFile dictFile,
- RandomAccessFile postFile,
- PostingsInputType postInType,
- BufferType fileBufferType,
- int cacheSize,
- int nameBufferSize,
- int offsetsBufferSize,
- int infoBufferSize,
- int infoOffsetsBufferSize,
- Partition part,
- DiskDictionary mainDict)
+ RandomAccessFile postFile,
+ PostingsInputType postInType,
+ BufferType fileBufferType,
+ int cacheSize,
+ int nameBufferSize,
+ int offsetsBufferSize,
+ int infoBufferSize,
+ int infoOffsetsBufferSize,
+ Partition part,
+ DiskDictionary mainDict)
throws java.io.IOException {
super(new EntryFactory(Type.ID_FREQ),
- new StringNameHandler(),
- dictFile,
- new RandomAccessFile[]{postFile},
- postInType,
- fileBufferType,
- cacheSize,
- nameBufferSize,
- offsetsBufferSize,
- infoBufferSize,
- infoOffsetsBufferSize,
- part);
+ new StringNameHandler(),
+ dictFile,
+ new RandomAccessFile[]{postFile},
+ postInType,
+ fileBufferType,
+ cacheSize,
+ nameBufferSize,
+ offsetsBufferSize,
+ infoBufferSize,
+ infoOffsetsBufferSize,
+ part);
this.mainDict = mainDict;
} // DiskBiGramDictionary constructor
@@ -116,11 +115,11 @@
* tested.
*/
public int[] getMatching(String wc,
- boolean starts,
- boolean ends) {
+ boolean starts,
+ boolean ends) {
//
// Quick sanity check.
- if(wc.length() == 0 || size() == 0) {
+ if (wc.length() == 0 || size() == 0) {
return null;
}
@@ -142,7 +141,7 @@
// calculating bigrams will depend on whether the string has to be
// anchored at one end of the pattern or another.
int b, e;
- if(starts) {
+ if (starts) {
bg[0] = (char) 0;
b = 0;
} else {
@@ -151,20 +150,19 @@
}
int l = wc.length();
- if(ends) {
+ if (ends) {
e = l;
} else {
e = l - 1;
}
- for(int i = b; i <= e; i++) {
+ for (int i = b; i <= e; i++) {
bg[1] = (ends && i == e) ? (char) 0 : wc.charAt(i);
//
// If there's any wildcard character, we can't have a bigram!
- if(bg[0] == '*' || bg[0] == '?' ||
- bg[1] == '*' || bg[1] == '?') {
+ if (bg[0] == '*' || bg[0] == '?' || bg[1] == '*' || bg[1] ==
'?') {
bg[0] = bg[1];
continue;
}
@@ -175,7 +173,7 @@
// dictionary we're associated with won't contain any entry
// whose name contains this bigram!
Entry bigram = get(new String(bg));
- if(bigram == null) {
+ if (bigram == null) {
return null;
}
bigrams.add(bigram);
@@ -188,7 +186,7 @@
//
// If we have bigrams, then make our ID list.
- if(bigrams.size() > 0) {
+ if (bigrams.size() > 0) {
ag = intersect(bigrams);
} else {
@@ -196,14 +194,14 @@
// Pull out any unigrams and use those.
char[] ug = new char[wc.length()];
int nu = 0;
- for(int i = 0; i < wc.length(); i++) {
+ for (int i = 0; i < wc.length(); i++) {
char c = wc.charAt(i);
- if(c != '*' && c != '?') {
+ if (c != '*' && c != '?') {
ug[nu++] = c;
}
}
- if(nu == 0) {
+ if (nu == 0) {
//
// No unigrams, no bigrams. It's all wildcards, so we need
to check
// everything.
@@ -217,7 +215,7 @@
// all the occurrences of the unigram. We'll union together
// the postings for these and then intersect the resulting
// unions.
- for(int i = 0; i < nu; i++) {
+ for (int i = 0; i < nu; i++) {
//
// Get this character and the next greater one.
@@ -225,7 +223,7 @@
String upper = Character.toString((char) ((int) ug[i] + 1));
ArrayGroup curr = getUnigrams(lower, upper);
- if(ag == null) {
+ if (ag == null) {
ag = curr;
} else {
ag = ag.intersect(curr);
@@ -233,7 +231,7 @@
//
// If we ever drop to zero size, we're done.
- if(ag.getSize() == 0) {
+ if (ag.getSize() == 0) {
return null;
}
}
@@ -243,7 +241,7 @@
//
// At this point, no hits means no matches.
- if(ret.length == 0) {
+ if (ret.length == 0) {
return null;
}
return ret;
@@ -264,7 +262,7 @@
public int[] getAllVariants(String wc, boolean allowPartial) {
//
// Quick sanity check.
- if(wc.length() == 0 || size() == 0) {
+ if (wc.length() == 0 || size() == 0) {
return null;
}
@@ -285,7 +283,7 @@
// bigrams.
bg[0] = wc.charAt(0);
- for(int i = 1; i < wc.length(); i++) {
+ for (int i = 1; i < wc.length(); i++) {
bg[1] = wc.charAt(i);
@@ -296,8 +294,8 @@
// dictionary we're associated with won't contain any entry
// whose name contains this bigram!
Entry bigram = get(new String(bg));
- if(bigram == null) {
- if(allowPartial) {
+ if (bigram == null) {
+ if (allowPartial) {
bg[0] = bg[1];
continue;
} else {
@@ -314,7 +312,7 @@
//
// If we have bigrams, then make our ID list.
- if(bigrams.size() > 0) {
+ if (bigrams.size() > 0) {
ag = union(bigrams);
} else {
@@ -324,7 +322,7 @@
}
int[] ret = ag.getDocs();
- if(ret.length == 0) {
+ if (ret.length == 0) {
return null;
}
return ret;
@@ -343,9 +341,9 @@
Collections.sort(entries, new EntrySizeComparator());
ArrayGroup ag = null;
PostingsIteratorFeatures feat = new PostingsIteratorFeatures();
- for(Iterator i = entries.iterator(); i.hasNext();) {
+ for (Iterator i = entries.iterator(); i.hasNext();) {
QueryEntry e = (QueryEntry) i.next();
- if(ag == null) {
+ if (ag == null) {
ag = new ArrayGroup(e.iterator(feat));
} else {
ag = ag.destructiveIntersect(e.iterator(feat));
@@ -366,14 +364,14 @@
// Throw all the entries into a ScoredQuickOr, possibly tossing
// out entries below a particular score?
ScoredQuickOr qor = new ScoredQuickOr((DiskPartition) part,
- mainDict.getMaxID());
+ mainDict.getMaxID());
PostingsIteratorFeatures feat = new PostingsIteratorFeatures();
- for(Iterator it = entries.iterator(); it.hasNext();) {
+ for (Iterator it = entries.iterator(); it.hasNext();) {
QueryEntry e = (QueryEntry) it.next();
qor.add(e.iterator(feat));
}
ScoredGroup sg = (ScoredGroup) qor.getGroup();
- if(sg.getSize() >= 1000) {
+ if (sg.getSize() >= 1000) {
sg.sort(true);
float nthScore = sg.getScore(1000);
sg.discardBelow(nthScore);
@@ -386,27 +384,27 @@
* the bigrams that have that character as a first character.
*/
protected ArrayGroup getUnigrams(String lower,
- String upper) {
+ String upper) {
DictionaryIterator di = iterator(lower, true,
- upper, false);
+ upper, false);
int[] ids = new int[di.estimateSize()];
int p = 0;
PostingsIteratorFeatures feat = new PostingsIteratorFeatures();
- while(di.hasNext()) {
+ while (di.hasNext()) {
PostingsIterator pi = ((QueryEntry) di.next()).iterator(feat);
- if(pi == null) {
+ if (pi == null) {
continue;
}
- while(pi.next()) {
+ while (pi.next()) {
ids[p++] = pi.getID();
}
}
java.util.Arrays.sort(ids, 0, p);
int size = 0;
int prev = -1;
- for(int i = 0; i < p; i++) {
- if(ids[i] != prev) {
+ for (int i = 0; i < p; i++) {
+ if (ids[i] != prev) {
ids[size++] = ids[i];
}
prev = ids[i];
@@ -414,10 +412,10 @@
return new ArrayGroup(ids, size);
}
- public static void merge(MergeState mergeState,
- DiskBiGramDictionary[] dicts ) throws java.io.IOException {
+ public static void merge(MergeState mergeState,
+ DiskBiGramDictionary[] dicts) throws java.io.IOException {
DiskDictionary.merge(
- mergeState.manager.getIndexDir(),
+ mergeState.manager.getIndexDir(),
new StringNameHandler(),
(DiskDictionary[]) dicts,
null,
diff -r 61833746ad29 -r ae83b73f5591
Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
--- a/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
Sun Apr 03 16:36:49 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
Sun Apr 03 17:41:22 2011 -0700
@@ -714,7 +714,9 @@
//
// Bounds check.
if(posn < 0 || posn >= dh.size) {
- logger.info(String.format("posn: %d size: %d", posn, dh.size));
+ if(posn < 0) {
+ logger.log(Level.INFO, String.format("posn: %d size: %d", posn,
dh.size), new Exception("Negative posn?"));
+ }
return null;
}
@@ -870,16 +872,14 @@
//
// First, get the matching entry IDs from the bigram dictionary.
- int[] entryIds = biDict.getMatching(pat);
+ int[] candidateEntryIDs = biDict.getMatching(pat);
- logger.info(String.format("candidates: %s",
Arrays.toString(entryIds)));
-
if(qtt.timedOut) {
// Operation timed out
return new QueryEntry[0];
}
- if(entryIds == null) {
+ if(candidateEntryIDs == null) {
// There's nothing that could match
return null;
}
@@ -895,7 +895,7 @@
//
// Now check the entry IDs.
- if(entryIds.length == 0) {
+ if(candidateEntryIDs.length == 0) {
// There's no bigrams or unigrams to narrow down by.
// Try everything:
Iterator entryIt = iterator();
@@ -911,11 +911,10 @@
//
// Now look up each entry and see if it matches the
// result.
- for(int i = 0; (i < entryIds.length) && (entryIds[i] != 0) &&
+ for(int i = 0; (i < candidateEntryIDs.length) &&
(candidateEntryIDs[i] != 0) &&
(!qtt.timedOut) &&
((maxEntries <= 0) || (res.size() < maxEntries)); i++) {
- QueryEntry curr = getByID(entryIds[i]);
- logger.info(String.format("%d: %s", entryIds[i], curr));
+ QueryEntry curr = getByID(candidateEntryIDs[i]);
if(Util.match(patArray, curr.toString().toCharArray(),
caseSensitive)) {
res.add(curr);
diff -r 61833746ad29 -r ae83b73f5591
Minion/src/com/sun/labs/minion/indexer/partition/InvFileDiskPartition.java
---
a/Minion/src/com/sun/labs/minion/indexer/partition/InvFileDiskPartition.java
Sun Apr 03 16:36:49 2011 -0700
+++
b/Minion/src/com/sun/labs/minion/indexer/partition/InvFileDiskPartition.java
Sun Apr 03 17:41:22 2011 -0700
@@ -239,7 +239,7 @@
return f.fetchOne(docID);
}
}
-
+
/**
* Gets an iterator for all of the values in a field.
*
diff -r 61833746ad29 -r ae83b73f5591
Minion/src/com/sun/labs/minion/test/QueryTest.java
--- a/Minion/src/com/sun/labs/minion/test/QueryTest.java Sun Apr 03
16:36:49 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/test/QueryTest.java Sun Apr 03
17:41:22 2011 -0700
@@ -686,7 +686,7 @@
output.println("model:" + currModel.toString());
String[] labels = currModel.getSenseLabels();
for(String l : labels) {
- logger.info("label: " + l);
+ logger.info(String.format("label: %s", l));
}
} catch(Exception e) {
System.err.println("Exception during disambiguation");
@@ -724,6 +724,41 @@
for(DiskPartition p : manager.getActivePartitions()) {
for(DiskField df :
((InvFileDiskPartition) p).getDiskFields()) {
+
+ if(!df.getInfo().hasAttribute(
+ FieldInfo.Attribute.TOKENIZED)) {
+ continue;
+ }
+ output.format("Field: %s\n", df.getInfo().getName());
+ List<QueryEntry> entries = df.getWildcardMatches(pat,
+
wildCaseSensitive,
+ -1, -1);
+ if(entries.isEmpty()) {
+ output.println("No matches");
+ } else {
+ output.format("%d token matches\n",
entries.size());
+ for(QueryEntry e : entries) {
+ output.format(" %s (%d)\n", e.getName(),
+ e.getN());
+ }
+ }
+ }
+ }
+
+ } catch(Exception e) {
+ logger.log(Level.SEVERE, "Exception in :wild", e);
+ return 0;
+ }
+ } else if(q.startsWith(":fwild ")) {
+
+ try {
+ String pat = q.substring(q.indexOf(' ') + 1).trim();
+ for(DiskPartition p : manager.getActivePartitions()) {
+ for(DiskField df :
+ ((InvFileDiskPartition) p).getDiskFields()) {
+ if(df.getInfo().getType() != FieldInfo.Type.STRING)
{
+ continue;
+ }
output.format("Field: %s\n", df.getInfo().getName());
output.flush();
List<QueryEntry> entries = df.getMatching(pat,
@@ -732,9 +767,8 @@
if(entries.isEmpty()) {
output.println("No matches");
} else {
- output.format("%d matches\n", entries.size());
+ output.format("%d saved value matches\n",
entries.size());
for(QueryEntry e : entries) {
- output.println("");
output.format(" %s (%d)\n", e.getName(),
e.getN());
}
diff -r ae83b73f5591 -r 5b1430509004
Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java
--- a/Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java Sun
Apr 03 17:41:22 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/indexer/DiskDictionaryBundle.java Sun
Apr 03 17:47:18 2011 -0700
@@ -39,6 +39,7 @@
import java.util.Collections;
import java.util.List;
import java.util.PriorityQueue;
+import java.util.logging.Level;
import java.util.logging.Logger;
/**
@@ -443,6 +444,7 @@
getName()));
return null;
}
+
QueryEntry[] qes = dicts[Type.RAW_SAVED.ordinal()].getMatching(
savedBigrams, val,
caseSensitive,
diff -r ae83b73f5591 -r 5b1430509004
Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
--- a/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
Sun Apr 03 17:41:22 2011 -0700
+++ b/Minion/src/com/sun/labs/minion/indexer/dictionary/DiskDictionary.java
Sun Apr 03 17:47:18 2011 -0700
@@ -714,9 +714,6 @@
//
// Bounds check.
if(posn < 0 || posn >= dh.size) {
- if(posn < 0) {
- logger.log(Level.INFO, String.format("posn: %d size: %d", posn,
dh.size), new Exception("Negative posn?"));
- }
return null;
}
| [minion~hg:88] It's ok if we get a negative position while searching. | stgreen | 04/28/2011 |





