instrumentation: add next-share/
[cs-p2p-next.git] / instrumentation / next-share / BaseLib / Core / Search / KeywordSearch.py
1 # written by Jelle Roozenburg
2 # see LICENSE.txt for license information
3
4 import re
5 import sys
6
# Set to True to trace the individual search steps on stderr.
DEBUG = False


class KeywordSearch:
    """
    Case-insensitive keyword search over items that carry a 'name' field.

    Features:
    1. simpleSearch() pre-filters the items. In the default 'AND' mode an
       item is kept only if every keyword occurs in its 'name'; in 'OR'
       mode a single matching keyword is enough.
    2. search() sorts the remaining items on:
       a) the number of matching keywords,
       b) the length of the matching keywords,
       c) whether a keyword matched a whole word (when searching for 'cat',
          a title containing the word 'cat' ranks above 'category').
    3. Searching is case insensitive.
    """

    # Characters that have a special meaning inside a regular expression.
    _regexpChars = re.compile(r'([\\*.+?|()\[\]{}^$])')

    def search(self, haystack, needles, haystackismatching=False):
        """
        Return the items of haystack that match needles, best match first.

        haystack: sequence of dicts that each have a string under 'name'.
        needles: iterable of keyword strings; surrounding whitespace is
            stripped, empty keywords are dropped, case is ignored.
        haystackismatching: when True, haystack is taken as already
            filtered and the simpleSearch() step is skipped.

        Every keyword adds to an item's score: 2 * len(keyword) for each
        whole-word occurrence, or len(keyword) once when it only occurs as
        part of a longer word. Items with the same score keep their
        relative order from haystack.
        """
        if DEBUG:
            sys.stderr.write('kws: unprocessed keywords: %s\n' % (needles,))
        # Keep the plain keywords for substring tests and score weights;
        # only the whole-word regexps below need the escaped form.
        words = self._normalizeSearchwords(needles)
        if DEBUG:
            sys.stderr.write('kws: Searching for %s in %d items\n'
                             % (repr(words), len(haystack)))

        if haystackismatching:
            searchspace = haystack
        else:
            searchspace = self.simpleSearch(haystack, words)
            if DEBUG:
                sys.stderr.write('kws: Found %s items using simple search\n'
                                 % len(searchspace))

        wholeWord = []
        for word in words:
            escaped = self._regexpChars.sub(r'\\\1', word)
            wholeWord.append((word, re.compile(r'\b%s\b' % escaped)))

        results = []
        for item in searchspace:
            title = item['name'].lower()
            score = 0
            for word, pattern in wholeWord:
                occurrences = len(pattern.findall(title))
                if occurrences:
                    score += occurrences * 2 * len(word)
                elif word in title:
                    score += len(word)
            results.append((score, item))

        # Sort on the score only: comparing the item dicts to break ties
        # is arbitrary on Python 2 and a TypeError on Python 3.
        results.sort(key=lambda result: result[0], reverse=True)
        if DEBUG:
            sys.stderr.write('kws: Found %d items eventually\n' % len(results))
        return [item for _, item in results]

    def _normalizeSearchwords(self, needles):
        "Strip and lowercase the keywords, dropping the empty ones"
        words = []
        for needle in needles:
            word = needle.strip().lower()
            if word:
                words.append(word)
        return words

    def unRegExpifySearchwords(self, needles):
        """
        Return the keywords stripped, lowercased and with every regexp
        metacharacter backslash-escaped, so that each one can be embedded
        in a regular expression as literal text. Empty keywords are dropped.
        """
        return [self._regexpChars.sub(r'\\\1', word)
                for word in self._normalizeSearchwords(needles)]

    def simpleSearch(self, haystack, needles, searchtype='AND'):
        """
        Can do both OR or AND search; returns the matching items in
        haystack order. Names are lowercased before matching, so needles
        should be lowercase.

        'AND': every needle must occur in the name as a literal substring.
        'OR': the needles are joined with '|' into one regular expression,
            so they have to be regexp-safe already (see
            unRegExpifySearchwords()).
        Any other searchtype gives an empty list.
        """
        hits = []
        if searchtype == 'OR':
            anyNeedle = re.compile('|'.join(needles))
            for item in haystack:
                if anyNeedle.search(item['name'].lower()) is not None:
                    hits.append(item)
        elif searchtype == 'AND':
            for item in haystack:
                title = item['name'].lower()
                for needle in needles:
                    if needle not in title:
                        break
                else:
                    # No needle was missing from this title.
                    hits.append(item)
        return hits
93
94
def test():
    "Smoke test: rank a handful of sample titles on two keywords and print them"
    titles = ['Fedoras 3.10',
              'Fedora 2.10',
              'Movie 3.10',
              'fedora_2',
              'movie_theater.avi']
    data = [{'name': title} for title in titles]
    words = ['fedora', '1']
    ranked = KeywordSearch().search(data, words)
    print(ranked)
# Run the smoke test only when this file is executed as a script.
if __name__ == "__main__":
    test()
107