jadmanski | 14ad396 | 2008-06-06 15:54:34 +0000 | [diff] [blame] | 1 | import re,string |
| 2 | |
| 3 | |
class reason_counter:
    """Count occurrences of one failure reason and hold its display text.

    wording -- text currently used to describe the reason
    num     -- number of occurrences recorded so far (starts at 1)
    """

    def __init__(self, wording):
        # Creating the counter already accounts for the first occurrence.
        self.num = 1
        self.wording = wording

    def update(self, new_wording):
        """Record one more occurrence and replace the wording."""
        self.wording = new_wording
        self.num += 1

    def html(self):
        """Return the display text; repeated reasons get a "(N+)" suffix."""
        if self.num != 1:
            return "%s (%d+)" % (self.wording, self.num)
        return self.wording
jadmanski | 14ad396 | 2008-06-06 15:54:34 +0000 | [diff] [blame] | 18 | |
| 19 | |
def numbers_are_irrelevant(txt):
    """Tell whether numbers in the reason text txt may be replaced by NN.

    The answer is currently always True.  If some categories of reasons
    ever need to keep their numbers, this function is the place to return
    False for those categories.
    """
    return True
jadmanski | 14ad396 | 2008-06-06 15:54:34 +0000 | [diff] [blame] | 26 | |
| 27 | |
def aggregate_reason_fields(reasons_list):
    """Collapse a list of reason strings into counted, generalized reasons.

    Each entry of reasons_list may itself hold several reasons separated
    by '|'.  Reasons are grouped after surrounding whitespace is stripped,
    inner whitespace runs are reduced to a single space and, when
    numbers_are_irrelevant() says so, digit runs are replaced by "NN".
    Empty reasons are ignored.

    Returns a list of display strings (as produced by reason_counter.html()),
    ordered from the most frequent group to the least frequent one.  A
    reason that occurred once keeps its original, non-reduced wording; a
    reason that occurred more than once is shown in its reduced form.
    """
    reason_htable = {}

    # Joining and re-splitting on '|' flattens combined entries into
    # individual reasons.
    for reason in '|'.join(reasons_list).split('|'):
        reason_reduced = re.sub(r"\s+", " ", reason.strip())
        if not reason_reduced:
            continue  # ignore empty reasons

        if numbers_are_irrelevant(reason_reduced):
            # generalize numbers embedded in the reason descriptor
            reason_reduced = re.sub(r"\d+", "NN", reason_reduced)

        if reason_reduced not in reason_htable:
            # first occurrence: remember the original wording
            reason_htable[reason_reduced] = reason_counter(reason)
        else:
            # repeated occurrence: switch to the generalized wording
            reason_htable[reason_reduced].update(reason_reduced)

    # sorted() and a list comprehension are used instead of keys().sort()
    # and map() so the function works, and returns a real list, on both
    # Python 2 and Python 3.
    generic_reasons = sorted(reason_htable,
                             key=lambda k: reason_htable[k].num,
                             reverse=True)
    return [reason_htable[generic_reason].html()
            for generic_reason in generic_reasons]