Fastest way to read and search a string in a large file using core java
Hi All,
I want to read a file, search for a string in it, and return its related value.
The file contains key/value pairs separated by a space. The key is a string and the value is a URL in an <a> tag; the file size is 60MB.
I read the file using a FileInputStream, stored it in a HashMap, and performed the search on the HashMap; it takes less than a second to read the file, search for a string, and return its value.
The search in the file has some rules:
1. The search string can be a sentence; we have to search for matching keys in the file and return all the values (URLs), and the search string length should be greater than 10.
2. If the search string contains an _ (underscore) at the end, then if an exact match (key) is found in the file there is no need to check the length of the search string.
Example of a search string:
This is my first post to Java Programming Forums.
If the file contains the key Programming, then we have to return all URLs related to this key.
I tried using a file Scanner, a BufferedReader, a FileInputStream, a direct search in the file, and a HashMap. Using the HashMap gave the best performance.
Can anybody please suggest how I can improve the performance so that I can achieve the file read and search within 5 milliseconds? This would be a great help for me.
Thanks a lot in advance....
-----------
The code I have written to read the file is shown below:
Please suggest changes or any better approach so that we can bring the reading and search time down to less than 5 milliseconds.
1. File CharByCharSearch.java
Code :
import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
/**
 * Loads a key/link lookup file into memory once, during class initialisation,
 * and answers keyword searches against the loaded keys.
 *
 * <p>Each input line is expected to look like
 * {@code key <A HREF="url" title="..." ...>key</A>}. The lower-cased key is
 * mapped to the text between the first {@code ="} and the character that
 * precedes the next space (the URL in the sample data).
 */
public class CharByCharSearch {

    /**
     * Lower-cased key to extracted link value. Filled by
     * {@link #grabHTMLLinksSearch()} from the static initialiser and only read
     * by {@link #search(String)}.
     *
     * <p>The map is shared directly rather than through a {@code ThreadLocal}:
     * a value set on a {@code ThreadLocal} in the static initialiser is visible
     * only to the thread that ran the initialiser, so every other thread would
     * read {@code null}.
     */
    private static final HashMap<String, String> mapForKeyValues = new HashMap<String, String>();

    private static CharByCharSearch getHtml = null;
    private static BufferedReader dataInputStream = null;

    static {
        getHtml = new CharByCharSearch();
        dataInputStream = FileReader.getFileContentsBR();
        getHtml.grabHTMLLinksSearch();
    }

    public CharByCharSearch() {
    }

    /**
     * Reads every line from the shared reader, stores the well-formed ones in
     * the lookup map, prints the elapsed load time and closes the reader.
     *
     * <p>Lines that do not have the expected shape are skipped individually so
     * that one bad line does not discard the remainder of the file.
     */
    public void grabHTMLLinksSearch() {
        if (null == dataInputStream) {
            System.out.println("error when getting the data");
            return;
        }
        long startMillis = System.currentTimeMillis();
        try {
            String line = dataInputStream.readLine();
            while (null != line) {
                addEntry(line);
                line = dataInputStream.readLine();
            }
            // Measured after the loop so the figure covers the whole load.
            System.out.println("time took to load the key/value file (ms) "
                    + (System.currentTimeMillis() - startMillis));
        } catch (Exception e) {
            // Kept broad on purpose: this runs from the static initialiser,
            // where an escaping exception would make the class unusable.
            System.out.println("error when getting the data");
            e.printStackTrace();
        } finally {
            try {
                dataInputStream.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Parses one line and stores its key/value pair; silently ignores lines
     * that lack a non-empty key before {@code <}, an {@code =}, or a space
     * after the value.
     */
    private static void addEntry(String line) {
        int tagStart = line.indexOf('<');
        int equalsAt = line.indexOf('=');
        // tagStart < 2 means no '<', or nothing usable before the separator
        // that precedes it (an empty key cannot be searched for meaningfully).
        if (tagStart < 2 || equalsAt < 0) {
            return;
        }
        int valueStart = equalsAt + 2; // skip the '=' and the opening quote
        int spaceAt = line.indexOf(' ', valueStart);
        if (spaceAt <= valueStart) {
            return; // no terminating space, or no room for the closing quote
        }
        String key = line.substring(0, tagStart - 1).toLowerCase();
        // spaceAt - 1 drops the closing quote that precedes the space.
        String value = line.substring(valueStart, spaceAt - 1);
        mapForKeyValues.put(key, value);
    }

    /**
     * Returns the stored values of every key that matches {@code searchWord}.
     *
     * <p>The search text is lower-cased and split on single spaces. A key
     * matches when:
     * <ul>
     *   <li>it is at least 10 characters long and occurs in the search text;</li>
     *   <li>otherwise, it equals the whole search text (ignoring case);</li>
     *   <li>otherwise, it ends with {@code _} and occurs in the search text;</li>
     *   <li>otherwise, it equals one of the space-separated words.</li>
     * </ul>
     *
     * @param searchWord the text to search with; {@code null} yields an empty list
     * @return the values of all matching keys, in map iteration order
     */
    public ArrayList<String> search(String searchWord) {
        ArrayList<String> linkURLS = new ArrayList<String>();
        if (null == searchWord) {
            return linkURLS;
        }
        String searchKey = searchWord.toLowerCase();
        String[] searchKeyValues = searchKey.split(" ");
        Set<String> keys = mapForKeyValues.keySet();
        for (String key : keys) {
            if (matches(key, searchKey, searchKeyValues)) {
                linkURLS.add(mapForKeyValues.get(key));
            }
        }
        return linkURLS;
    }

    /**
     * Applies the matching rules to a single key. The substring search runs
     * only in the branches that actually depend on its result.
     */
    private static boolean matches(String key, String searchKey, String[] words) {
        if (key.length() >= 10) { // rule 1 & 2
            return -1 != searchByChar.kmp(searchKey, key);
        }
        if (key.equalsIgnoreCase(searchKey)) { // rule 3
            return true;
        }
        if (key.endsWith("_")) { // rule 5
            return -1 != searchByChar.kmp(searchKey, key);
        }
        for (String word : words) { // rule 4
            if (word.equalsIgnoreCase(key)) {
                return true;
            }
        }
        return false;
    }

    public static void main(String[] args) {
        ArrayList<String> linkURLS = getHtml
                .search("My Name Is jay_ patil_00_.");
        for (String value : linkURLS) {
            System.out.println(value);
        }
    }
}
--------------------------------------------------------------------------
2. Second java file searchByChar.java
Code :
/**
 * Knuth-Morris-Pratt substring search helpers.
 */
public class searchByChar {

    /**
     * Builds the KMP failure table for {@code pattern}.
     *
     * <p>{@code next[0]} is the sentinel {@code -1}; for {@code i > 0},
     * {@code next[i]} is the length of the longest proper prefix of the first
     * {@code i} pattern characters that is also a suffix of them.
     *
     * @param pattern the pattern to preprocess
     * @return a table with one entry per pattern character; empty for an
     *         empty pattern
     */
    public static int[] prekmp(String pattern) {
        int length = pattern.length();
        int[] next = new int[length];
        if (length == 0) {
            // Nothing to preprocess; writing next[0] would be out of bounds.
            return next;
        }
        int i = 0;
        int j = -1;
        next[0] = -1;
        while (i < length - 1) {
            while (j >= 0 && pattern.charAt(i) != pattern.charAt(j)) {
                j = next[j];
            }
            i++;
            j++;
            next[i] = j;
        }
        return next;
    }

    /**
     * Finds the first occurrence of {@code pattern} in {@code text}.
     *
     * @param text    the text to scan
     * @param pattern the substring to look for
     * @return the index of the first match, {@code 0} for an empty pattern
     *         (as {@link String#indexOf(String)} does), or {@code -1} when
     *         the pattern does not occur
     */
    public static int kmp(String text, String pattern) {
        int patternLength = pattern.length();
        if (patternLength == 0) {
            return 0;
        }
        if (patternLength > text.length()) {
            return -1; // a longer pattern can never fit
        }
        int[] next = prekmp(pattern);
        int i = 0;
        int j = 0;
        while (i < text.length()) {
            // Fall back through the table until the characters agree or the
            // sentinel (-1) says to restart at the next text character.
            while (j >= 0 && text.charAt(i) != pattern.charAt(j)) {
                j = next[j];
            }
            i++;
            j++;
            if (j == patternLength) {
                return i - patternLength;
            }
        }
        return -1;
    }
}
--------------------------------------------------------------------------
The contents of the text file look like:
patil_00_ <A HREF="http://support.jay.com:8080/index.jsp" title="View the Supp" target=_blank class="table">patil_00_</A>
jay_ <A HREF="http://support.sac.com:8080/index.jsp" title="View the jsp" target=_blank class="link">jay_</A>
...........................
The third file, FileReader.java, reads the text file using a DataInputStream and returns the dataInputStream.
Re: Fastest way to read and search a string in a large file using core java
Assuming it looks like:
<key1> <value1> <key2> <value2>... and if the search string is a substring of a key, return each of these results?
How are you hashing it right now? Are you hashing every substring possible?
Re: Fastest way to read and search a string in a large file using core java
Quote:
Originally Posted by
patilsn_jay
Hi All,
I want to read a file, search for a string in it, and return its related value.
The file contains key/value pairs separated by a space. The key is a string and the value is a URL in an <a> tag; the file size is 60MB.
I read the file using a FileInputStream, stored it in a HashMap, and performed the search on the HashMap; it takes less than a second to read the file, search for a string, and return its value.
The search in the file has some rules:
1. The search string can be a sentence; we have to search for matching keys in the file and return all the values (URLs), and the search string length should be greater than 10.
2. If the search string contains an _ (underscore) at the end, then if an exact match (key) is found in the file there is no need to check the length of the search string.
Example of a search string:
This is my first post to Java Programming Forums.
If the file contains the key Programming, then we have to return all URLs related to this key.
I tried using a file Scanner, a BufferedReader, a FileInputStream, a direct search in the file, and a HashMap. Using the HashMap gave the best performance.
Can anybody please suggest how I can improve the performance so that I can achieve the file read and search within 5 milliseconds? This would be a great help for me.
Thanks a lot in advance....
The code I have written to read the file is shown below:
Please suggest changes or any better approach so that we can bring the reading and search time down to less than 5 milliseconds.
1. File CharByCharSearch.java
Code :
import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
/**
 * Loads a key/link lookup file into memory once, during class initialisation,
 * and answers keyword searches against the loaded keys.
 *
 * <p>Each input line is expected to look like
 * {@code key <A HREF="url" title="..." ...>key</A>}. The lower-cased key is
 * mapped to the text between the first {@code ="} and the character that
 * precedes the next space (the URL in the sample data).
 */
public class CharByCharSearch {

    /**
     * Lower-cased key to extracted link value. Filled by
     * {@link #grabHTMLLinksSearch()} from the static initialiser and only read
     * by {@link #search(String)}.
     *
     * <p>The map is shared directly rather than through a {@code ThreadLocal}:
     * a value set on a {@code ThreadLocal} in the static initialiser is visible
     * only to the thread that ran the initialiser, so every other thread would
     * read {@code null}.
     */
    private static final HashMap<String, String> mapForKeyValues = new HashMap<String, String>();

    private static CharByCharSearch getHtml = null;
    private static BufferedReader dataInputStream = null;

    static {
        getHtml = new CharByCharSearch();
        dataInputStream = FileReader.getFileContentsBR();
        getHtml.grabHTMLLinksSearch();
    }

    public CharByCharSearch() {
    }

    /**
     * Reads every line from the shared reader, stores the well-formed ones in
     * the lookup map, prints the elapsed load time and closes the reader.
     *
     * <p>Lines that do not have the expected shape are skipped individually so
     * that one bad line does not discard the remainder of the file.
     */
    public void grabHTMLLinksSearch() {
        if (null == dataInputStream) {
            System.out.println("error when getting the data");
            return;
        }
        long startMillis = System.currentTimeMillis();
        try {
            String line = dataInputStream.readLine();
            while (null != line) {
                addEntry(line);
                line = dataInputStream.readLine();
            }
            // Measured after the loop so the figure covers the whole load.
            System.out.println("time took to load the key/value file (ms) "
                    + (System.currentTimeMillis() - startMillis));
        } catch (Exception e) {
            // Kept broad on purpose: this runs from the static initialiser,
            // where an escaping exception would make the class unusable.
            System.out.println("error when getting the data");
            e.printStackTrace();
        } finally {
            try {
                dataInputStream.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Parses one line and stores its key/value pair; silently ignores lines
     * that lack a non-empty key before {@code <}, an {@code =}, or a space
     * after the value.
     */
    private static void addEntry(String line) {
        int tagStart = line.indexOf('<');
        int equalsAt = line.indexOf('=');
        // tagStart < 2 means no '<', or nothing usable before the separator
        // that precedes it (an empty key cannot be searched for meaningfully).
        if (tagStart < 2 || equalsAt < 0) {
            return;
        }
        int valueStart = equalsAt + 2; // skip the '=' and the opening quote
        int spaceAt = line.indexOf(' ', valueStart);
        if (spaceAt <= valueStart) {
            return; // no terminating space, or no room for the closing quote
        }
        String key = line.substring(0, tagStart - 1).toLowerCase();
        // spaceAt - 1 drops the closing quote that precedes the space.
        String value = line.substring(valueStart, spaceAt - 1);
        mapForKeyValues.put(key, value);
    }

    /**
     * Returns the stored values of every key that matches {@code searchWord}.
     *
     * <p>The search text is lower-cased and split on single spaces. A key
     * matches when:
     * <ul>
     *   <li>it is at least 10 characters long and occurs in the search text;</li>
     *   <li>otherwise, it equals the whole search text (ignoring case);</li>
     *   <li>otherwise, it ends with {@code _} and occurs in the search text;</li>
     *   <li>otherwise, it equals one of the space-separated words.</li>
     * </ul>
     *
     * @param searchWord the text to search with; {@code null} yields an empty list
     * @return the values of all matching keys, in map iteration order
     */
    public ArrayList<String> search(String searchWord) {
        ArrayList<String> linkURLS = new ArrayList<String>();
        if (null == searchWord) {
            return linkURLS;
        }
        String searchKey = searchWord.toLowerCase();
        String[] searchKeyValues = searchKey.split(" ");
        Set<String> keys = mapForKeyValues.keySet();
        for (String key : keys) {
            if (matches(key, searchKey, searchKeyValues)) {
                linkURLS.add(mapForKeyValues.get(key));
            }
        }
        return linkURLS;
    }

    /**
     * Applies the matching rules to a single key. The substring search runs
     * only in the branches that actually depend on its result.
     */
    private static boolean matches(String key, String searchKey, String[] words) {
        if (key.length() >= 10) { // rule 1 & 2
            return -1 != searchByChar.kmp(searchKey, key);
        }
        if (key.equalsIgnoreCase(searchKey)) { // rule 3
            return true;
        }
        if (key.endsWith("_")) { // rule 5
            return -1 != searchByChar.kmp(searchKey, key);
        }
        for (String word : words) { // rule 4
            if (word.equalsIgnoreCase(key)) {
                return true;
            }
        }
        return false;
    }

    public static void main(String[] args) {
        ArrayList<String> linkURLS = getHtml
                .search("My Name Is jay_ patil_00_.");
        for (String value : linkURLS) {
            System.out.println(value);
        }
    }
}
--------------------------------------------------------------------------
2. Second java file searchByChar.java
Code :
/**
 * Knuth-Morris-Pratt substring search helpers.
 */
public class searchByChar {

    /**
     * Builds the KMP failure table for {@code pattern}.
     *
     * <p>{@code next[0]} is the sentinel {@code -1}; for {@code i > 0},
     * {@code next[i]} is the length of the longest proper prefix of the first
     * {@code i} pattern characters that is also a suffix of them.
     *
     * @param pattern the pattern to preprocess
     * @return a table with one entry per pattern character; empty for an
     *         empty pattern
     */
    public static int[] prekmp(String pattern) {
        int length = pattern.length();
        int[] next = new int[length];
        if (length == 0) {
            // Nothing to preprocess; writing next[0] would be out of bounds.
            return next;
        }
        int i = 0;
        int j = -1;
        next[0] = -1;
        while (i < length - 1) {
            while (j >= 0 && pattern.charAt(i) != pattern.charAt(j)) {
                j = next[j];
            }
            i++;
            j++;
            next[i] = j;
        }
        return next;
    }

    /**
     * Finds the first occurrence of {@code pattern} in {@code text}.
     *
     * @param text    the text to scan
     * @param pattern the substring to look for
     * @return the index of the first match, {@code 0} for an empty pattern
     *         (as {@link String#indexOf(String)} does), or {@code -1} when
     *         the pattern does not occur
     */
    public static int kmp(String text, String pattern) {
        int patternLength = pattern.length();
        if (patternLength == 0) {
            return 0;
        }
        if (patternLength > text.length()) {
            return -1; // a longer pattern can never fit
        }
        int[] next = prekmp(pattern);
        int i = 0;
        int j = 0;
        while (i < text.length()) {
            // Fall back through the table until the characters agree or the
            // sentinel (-1) says to restart at the next text character.
            while (j >= 0 && text.charAt(i) != pattern.charAt(j)) {
                j = next[j];
            }
            i++;
            j++;
            if (j == patternLength) {
                return i - patternLength;
            }
        }
        return -1;
    }
}
--------------------------------------------------------------------------
The contents of the text file look like:
patil_00_ <A HREF="http://support.jay.com:8080/index.jsp" title="View the Supp" target=_blank class="table">patil_00_</A>
jay_ <A HREF="http://support.sac.com:8080/index.jsp" title="View the jsp" target=_blank class="link">jay_</A>
...........................
The third file, FileReader.java, reads the text file using a DataInputStream and returns the dataInputStream.
Re: Fastest way to read and search a string in a large file using core java
Quote:
Originally Posted by
concerto49
Assuming it looks like:
<key1> <value1> <key2> <value2>... and if the search string is a substring of a key, return each of these results?
How are you hashing it right now? Are you hashing every substring possible?
The code I have written to read the file is shown below:
Please suggest changes or any better approach so that we can bring the reading and search time down to less than 5 milliseconds.
1. File CharByCharSearch.java
Code :
import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
/**
 * Loads a key/link lookup file into memory once, during class initialisation,
 * and answers keyword searches against the loaded keys.
 *
 * <p>Each input line is expected to look like
 * {@code key <A HREF="url" title="..." ...>key</A>}. The lower-cased key is
 * mapped to the text between the first {@code ="} and the character that
 * precedes the next space (the URL in the sample data).
 */
public class CharByCharSearch {

    /**
     * Lower-cased key to extracted link value. Filled by
     * {@link #grabHTMLLinksSearch()} from the static initialiser and only read
     * by {@link #search(String)}.
     *
     * <p>The map is shared directly rather than through a {@code ThreadLocal}:
     * a value set on a {@code ThreadLocal} in the static initialiser is visible
     * only to the thread that ran the initialiser, so every other thread would
     * read {@code null}.
     */
    private static final HashMap<String, String> mapForKeyValues = new HashMap<String, String>();

    private static CharByCharSearch getHtml = null;
    private static BufferedReader dataInputStream = null;

    static {
        getHtml = new CharByCharSearch();
        dataInputStream = FileReader.getFileContentsBR();
        getHtml.grabHTMLLinksSearch();
    }

    public CharByCharSearch() {
    }

    /**
     * Reads every line from the shared reader, stores the well-formed ones in
     * the lookup map, prints the elapsed load time and closes the reader.
     *
     * <p>Lines that do not have the expected shape are skipped individually so
     * that one bad line does not discard the remainder of the file.
     */
    public void grabHTMLLinksSearch() {
        if (null == dataInputStream) {
            System.out.println("error when getting the data");
            return;
        }
        long startMillis = System.currentTimeMillis();
        try {
            String line = dataInputStream.readLine();
            while (null != line) {
                addEntry(line);
                line = dataInputStream.readLine();
            }
            // Measured after the loop so the figure covers the whole load.
            System.out.println("time took to load the key/value file (ms) "
                    + (System.currentTimeMillis() - startMillis));
        } catch (Exception e) {
            // Kept broad on purpose: this runs from the static initialiser,
            // where an escaping exception would make the class unusable.
            System.out.println("error when getting the data");
            e.printStackTrace();
        } finally {
            try {
                dataInputStream.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Parses one line and stores its key/value pair; silently ignores lines
     * that lack a non-empty key before {@code <}, an {@code =}, or a space
     * after the value.
     */
    private static void addEntry(String line) {
        int tagStart = line.indexOf('<');
        int equalsAt = line.indexOf('=');
        // tagStart < 2 means no '<', or nothing usable before the separator
        // that precedes it (an empty key cannot be searched for meaningfully).
        if (tagStart < 2 || equalsAt < 0) {
            return;
        }
        int valueStart = equalsAt + 2; // skip the '=' and the opening quote
        int spaceAt = line.indexOf(' ', valueStart);
        if (spaceAt <= valueStart) {
            return; // no terminating space, or no room for the closing quote
        }
        String key = line.substring(0, tagStart - 1).toLowerCase();
        // spaceAt - 1 drops the closing quote that precedes the space.
        String value = line.substring(valueStart, spaceAt - 1);
        mapForKeyValues.put(key, value);
    }

    /**
     * Returns the stored values of every key that matches {@code searchWord}.
     *
     * <p>The search text is lower-cased and split on single spaces. A key
     * matches when:
     * <ul>
     *   <li>it is at least 10 characters long and occurs in the search text;</li>
     *   <li>otherwise, it equals the whole search text (ignoring case);</li>
     *   <li>otherwise, it ends with {@code _} and occurs in the search text;</li>
     *   <li>otherwise, it equals one of the space-separated words.</li>
     * </ul>
     *
     * @param searchWord the text to search with; {@code null} yields an empty list
     * @return the values of all matching keys, in map iteration order
     */
    public ArrayList<String> search(String searchWord) {
        ArrayList<String> linkURLS = new ArrayList<String>();
        if (null == searchWord) {
            return linkURLS;
        }
        String searchKey = searchWord.toLowerCase();
        String[] searchKeyValues = searchKey.split(" ");
        Set<String> keys = mapForKeyValues.keySet();
        for (String key : keys) {
            if (matches(key, searchKey, searchKeyValues)) {
                linkURLS.add(mapForKeyValues.get(key));
            }
        }
        return linkURLS;
    }

    /**
     * Applies the matching rules to a single key. The substring search runs
     * only in the branches that actually depend on its result.
     */
    private static boolean matches(String key, String searchKey, String[] words) {
        if (key.length() >= 10) { // rule 1 & 2
            return -1 != searchByChar.kmp(searchKey, key);
        }
        if (key.equalsIgnoreCase(searchKey)) { // rule 3
            return true;
        }
        if (key.endsWith("_")) { // rule 5
            return -1 != searchByChar.kmp(searchKey, key);
        }
        for (String word : words) { // rule 4
            if (word.equalsIgnoreCase(key)) {
                return true;
            }
        }
        return false;
    }

    public static void main(String[] args) {
        ArrayList<String> linkURLS = getHtml
                .search("My Name Is jay_ patil_00_.");
        for (String value : linkURLS) {
            System.out.println(value);
        }
    }
}
--------------------------------------------------------------------------
2. Second java file searchByChar.java
Code :
/**
 * Knuth-Morris-Pratt substring search helpers.
 */
public class searchByChar {

    /**
     * Builds the KMP failure table for {@code pattern}.
     *
     * <p>{@code next[0]} is the sentinel {@code -1}; for {@code i > 0},
     * {@code next[i]} is the length of the longest proper prefix of the first
     * {@code i} pattern characters that is also a suffix of them.
     *
     * @param pattern the pattern to preprocess
     * @return a table with one entry per pattern character; empty for an
     *         empty pattern
     */
    public static int[] prekmp(String pattern) {
        int length = pattern.length();
        int[] next = new int[length];
        if (length == 0) {
            // Nothing to preprocess; writing next[0] would be out of bounds.
            return next;
        }
        int i = 0;
        int j = -1;
        next[0] = -1;
        while (i < length - 1) {
            while (j >= 0 && pattern.charAt(i) != pattern.charAt(j)) {
                j = next[j];
            }
            i++;
            j++;
            next[i] = j;
        }
        return next;
    }

    /**
     * Finds the first occurrence of {@code pattern} in {@code text}.
     *
     * @param text    the text to scan
     * @param pattern the substring to look for
     * @return the index of the first match, {@code 0} for an empty pattern
     *         (as {@link String#indexOf(String)} does), or {@code -1} when
     *         the pattern does not occur
     */
    public static int kmp(String text, String pattern) {
        int patternLength = pattern.length();
        if (patternLength == 0) {
            return 0;
        }
        if (patternLength > text.length()) {
            return -1; // a longer pattern can never fit
        }
        int[] next = prekmp(pattern);
        int i = 0;
        int j = 0;
        while (i < text.length()) {
            // Fall back through the table until the characters agree or the
            // sentinel (-1) says to restart at the next text character.
            while (j >= 0 && text.charAt(i) != pattern.charAt(j)) {
                j = next[j];
            }
            i++;
            j++;
            if (j == patternLength) {
                return i - patternLength;
            }
        }
        return -1;
    }
}
--------------------------------------------------------------------------
The contents of the text file look like:
patil_00_ <A HREF="http://support.jay.com:8080/index.jsp" title="View the Supp" target=_blank class="table">patil_00_</A>
jay_ <A HREF="http://support.sac.com:8080/index.jsp" title="View the jsp" target=_blank class="link">jay_</A>
...........................
The third file, FileReader.java, reads the text file using a DataInputStream and returns the dataInputStream.