-
Notifications
You must be signed in to change notification settings - Fork 0
/
DocumentParser.java
184 lines (168 loc) · 4.89 KB
/
DocumentParser.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import java.util.Iterator;
/**
 * Splits a document into words and extracts short snippets from it.
 *
 * <p>Parsing is lazy: an iterator scans only as far as the next word, and every
 * iterator keeps its own position, so several callers can walk the same
 * document without creating another {@code DocumentParser}.
 *
 * @author Chuka Okoye
 */
public class DocumentParser
{
    /** Number of words collected by {@link #getSnippet(int)}. */
    private static final int DEFAULT_SNIPPET_WORDS = 15;

    /** Characters of the document; shared read-only by every iterator. */
    private final char[] charArray;

    /**
     * Creates a parser over the given text.
     *
     * @param document the document to be parsed
     * @throws NullPointerException if {@code document} is {@code null}
     */
    public DocumentParser(String document)
    {
        if (document == null)
        {
            throw new NullPointerException("document must not be null");
        }
        this.charArray = document.toCharArray();
    }

    /**
     * Returns an iterator over the alphanumeric words of the document, in order.
     * A word is a maximal run of the characters 0-9, a-z and A-Z; every other
     * character acts as a separator.
     *
     * @return a new iterator positioned at the start of the document
     */
    public Iterator<Word> getAllWords()
    {
        return new WordIterator(charArray);
    }

    /**
     * Extracts a snippet of up to 15 space-delimited words.
     *
     * @param index character offset in the document at which to start reading;
     *              the caller is expected to pass the offset of the first
     *              character of a word
     * @return the extracted snippet (possibly empty), or {@code null} if
     *         {@code index} is negative
     */
    public Snippet getSnippet(int index)
    {
        return getSnippet(index, DEFAULT_SNIPPET_WORDS);
    }

    /**
     * Extracts a snippet of up to {@code maxWords} space-delimited words.
     *
     * @param index    character offset in the document at which to start
     *                 reading; the caller is expected to pass the offset of the
     *                 first character of a word
     * @param maxWords maximum number of words to collect; a value of zero or
     *                 less yields an empty snippet
     * @return the extracted snippet (possibly empty), or {@code null} if
     *         {@code index} is negative
     */
    public Snippet getSnippet(int index, int maxWords)
    {
        WordIterator iter = new WordIterator(charArray);
        if (!iter.setIteratorPosition(index))
        {
            return null;
        }
        Snippet snippet = new Snippet();
        for (int taken = 0; taken < maxWords; taken++)
        {
            // nextFullWord() returns null once only spaces (or nothing) remain,
            // so no separate hasNext() check is needed here.
            Word word = iter.nextFullWord();
            if (word == null)
            {
                break;
            }
            snippet.addWord(word);
        }
        return snippet;
    }

    /**
     * Cursor over the words of a document. It is both the {@link Iterable} and
     * its own {@link Iterator}, so {@link #iterator()} does not restart the scan.
     *
     * <p>Two tokenisations share the same cursor: {@link #next()} yields
     * alphanumeric words, {@link #nextFullWord()} yields space-delimited words.
     * {@link #hasNext()} refers to {@link #next()} only.
     *
     * @author Chuka Okoye
     */
    private static final class WordIterator implements Iterable<Word>, Iterator<Word>
    {
        private final char[] chars;

        /** Offset of the next character to examine. */
        private int currentPointer = 0;

        WordIterator(char[] chars)
        {
            this.chars = chars;
        }

        @Override
        public Iterator<Word> iterator()
        {
            return this;
        }

        /**
         * Reports whether {@link #next()} would return a word. Looks ahead for
         * an alphanumeric character without moving the cursor, so trailing
         * punctuation or whitespace does not count as a remaining element.
         *
         * @return {@code true} if at least one alphanumeric word remains
         */
        @Override
        public boolean hasNext()
        {
            for (int i = currentPointer; i < chars.length; i++)
            {
                if (isAsciiAlphanumeric(chars[i]))
                {
                    return true;
                }
            }
            return false;
        }

        /**
         * Returns the next alphanumeric word, skipping any separators before it.
         * The cursor is left on the separator that ended the word, or at the end
         * of the document.
         *
         * @return the next word together with its start offset, or {@code null}
         *         if no word remains (kept for existing callers, instead of
         *         throwing {@code NoSuchElementException})
         */
        @Override
        public Word next()
        {
            return scan(true);
        }

        /**
         * Returns the next space-delimited word, skipping any spaces before it.
         * Only the space character {@code ' '} separates words.
         * NOTE(review): tabs and line breaks stay inside the word; confirm
         * whether snippets should split on all whitespace.
         *
         * @return the next word together with its start offset, or {@code null}
         *         if only spaces (or nothing) remain
         */
        public Word nextFullWord()
        {
            return scan(false);
        }

        /**
         * Unsupported.
         *
         * @throws UnsupportedOperationException always
         */
        @Override
        public void remove()
        {
            throw new UnsupportedOperationException();
        }

        /**
         * Sets the offset from which scanning continues. It is up to the caller
         * to supply an offset that is the beginning of a word. Offsets past the
         * end of the document are accepted and simply yield no words.
         *
         * @param index the character offset to continue from
         * @return {@code true} if the position was updated, {@code false} if
         *         {@code index} is negative
         */
        public boolean setIteratorPosition(int index)
        {
            if (index < 0)
            {
                return false;
            }
            this.currentPointer = index;
            return true;
        }

        /**
         * Shared scanner behind {@link #next()} and {@link #nextFullWord()}:
         * skips separators, then consumes one run of word characters.
         *
         * @param alphanumericOnly {@code true} to treat only 0-9, a-z, A-Z as
         *                         word characters; {@code false} to treat every
         *                         character except {@code ' '} as one
         * @return the word found, or {@code null} if the end was reached first
         */
        private Word scan(boolean alphanumericOnly)
        {
            int start = currentPointer;
            while (start < chars.length && !isWordChar(chars[start], alphanumericOnly))
            {
                start++;
            }
            int end = start;
            while (end < chars.length && isWordChar(chars[end], alphanumericOnly))
            {
                end++;
            }
            currentPointer = end;
            if (end == start)
            {
                return null;
            }
            return new Word(new String(chars, start, end - start), start);
        }

        /** Decides whether a character belongs to a word under the chosen tokenisation. */
        private static boolean isWordChar(char character, boolean alphanumericOnly)
        {
            return alphanumericOnly ? isAsciiAlphanumeric(character) : character != ' ';
        }

        /**
         * Tests for the ASCII characters 0-9, A-Z and a-z.
         *
         * @param character the character to be verified
         * @return {@code true} if it is one of those characters
         */
        private static boolean isAsciiAlphanumeric(char character)
        {
            return (character >= '0' && character <= '9')
                || (character >= 'A' && character <= 'Z')
                || (character >= 'a' && character <= 'z');
        }
    }
}