You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

187 lines
8.2 KiB

  1. # -*- encoding: utf-8 -*-
  2. ##############################################################################
  3. #
  4. # Copyright (C) 2009 EduSense BV (<http://www.edusense.nl>).
  5. # All Rights Reserved
  6. #
  7. # This program is free software: you can redistribute it and/or modify
  8. # it under the terms of the GNU Affero General Public License as published
  9. # by the Free Software Foundation, either version 3 of the License, or
  10. # (at your option) any later version.
  11. #
  12. # This program is distributed in the hope that it will be useful,
  13. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. # GNU Affero General Public License for more details.
  16. #
  17. # You should have received a copy of the GNU Affero General Public License
  18. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  19. #
  20. ##############################################################################
  21. '''
  22. This module provides a utility class to extract postal codes from address
  23. strings.
  24. '''
  25. import re
  26. __all__ = ['split', 'get', 'PostalCode']
  27. class PostalCode(object):
  28. '''
  29. The PostalCode class is a wrapper around PostCodeFormat and an internal
  30. database of postalcode formats. It provides the class methods split() and
  31. get(), both of which must be called with the two character iso country
  32. code as first parameter.
  33. '''
  34. class PostalCodeFormat(object):
  35. '''
  36. Utility class of PostalCode.
  37. Allows finding and splitting of postalcode in strings
  38. '''
  39. def __init__(self, format):
  40. '''
  41. Create regexp patterns for matching
  42. '''
  43. # Sort formats on length, longest first
  44. formats = [(len(x), x) for x in format.split('|')]
  45. formats = [x[1] for x in sorted(formats, lambda x, y: -cmp(x, y))]
  46. self.res = [re.compile(x.replace('#', '\\d').replace('@', '[A-Z]'))
  47. for x in formats
  48. ]
  49. def get(self, str_):
  50. '''
  51. Return the postal code from the string str_
  52. '''
  53. for re_ in self.res:
  54. retval = re_.findall(str_)
  55. if retval:
  56. break
  57. return retval and retval[0] or ''
  58. def split(self, str_):
  59. '''
  60. Split str_ into (postalcode, remainder)
  61. '''
  62. for re_ in self.res:
  63. pos = re_.search(str_)
  64. if pos:
  65. break
  66. if pos:
  67. return (pos.group(), str_[pos.end():])
  68. return ('', str_)
  69. _formats = {
  70. 'AF': '', 'AX': '', 'AL': '', 'DZ': '#####', 'AS': '', 'AD': 'AD###',
  71. 'AO': '', 'AI': '', 'AQ': '', 'AG': '', 'AR': '@####@@@',
  72. 'AM': '######', 'AW': '', 'AU': '####', 'AT': '####', 'AZ': 'AZ ####',
  73. 'BS': '', 'BH': '####|###', 'BD': '####', 'BB': 'BB#####',
  74. 'BY': '######', 'BE': '####', 'BZ': '', 'BJ': '', 'BM': '@@ ##',
  75. 'BT': '', 'BO': '', 'BA': '#####', 'BW': '', 'BV': '',
  76. 'BR': '#####-###', 'IO': '', 'BN': '@@####', 'BG': '####', 'BF': '',
  77. 'BI': '', 'KH': '#####', 'CM': '', 'CA': '@#@ #@#', 'CV': '####',
  78. 'KY': '', 'CF': '', 'TD': '', 'CL': '#######', 'CN': '######',
  79. 'CX': '####', 'CC': '', 'CO': '', 'KM': '', 'CG': '', 'CD': '',
  80. 'CK': '', 'CR': '####', 'CI': '', 'HR': 'HR-#####', 'CU': 'CP #####',
  81. 'CY': '####', 'CZ': '### ##', 'DK': '####', 'DJ': '', 'DM': '',
  82. 'DO': '#####', 'EC': '@####@', 'EG': '#####', 'SV': 'CP ####',
  83. 'GQ': '', 'ER': '', 'EE': '#####', 'ET': '####', 'FK': '',
  84. 'FO': 'FO-###', 'FJ': '', 'FI': 'FI-#####', 'FR': '#####',
  85. 'GF': '#####', 'PF': '#####', 'TF': '', 'GA': '', 'GM': '',
  86. 'GE': '####', 'DE': '#####', 'GH': '', 'GI': '', 'GR': '### ##',
  87. 'GL': '####', 'GD': '', 'GP': '#####', 'GU': '969##', 'GT': '#####',
  88. 'GG': '@# #@@|@## #@@|@@# #@@|@@## #@@|@#@ #@@|@@#@ #@@|GIR0AA',
  89. 'GN': '', 'GW': '####', 'GY': '', 'HT': 'HT####', 'HM': '', 'VA': '',
  90. 'HN': '@@####', 'HK': '', 'HU': '####', 'IS': '###', 'IN': '######',
  91. 'ID': '#####', 'IR': '##########', 'IQ': '#####', 'IE': '',
  92. 'IM': '@# #@@|@## #@@|@@# #@@|@@## #@@|@#@ #@@|@@#@ #@@|GIR0AA',
  93. 'IL': '#####', 'IT': '####', 'JM': '', 'JP': '###-####',
  94. 'JE': '@# #@@|@## #@@|@@# #@@|@@## #@@|@#@ #@@|@@#@ #@@|GIR0AA',
  95. 'JO': '#####', 'KZ': '######', 'KE': '#####', 'KI': '',
  96. 'KP': '###-###',
  97. 'KR': 'SEOUL ###-###', 'KW': '#####', 'KG': '######', 'LA': '#####',
  98. 'LV': 'LV-####', 'LB': '#### ####|####', 'LS': '###', 'LR': '####',
  99. 'LY': '', 'LI': '####', 'LT': 'LT-#####', 'LU': '####', 'MO': '',
  100. 'MK': '####', 'MG': '###', 'MW': '', 'MY': '#####', 'MV': '#####',
  101. 'ML': '', 'MT': '@@@ ###|@@@ ##', 'MH': '', 'MQ': '#####', 'MR': '',
  102. 'MU': '', 'YT': '#####', 'MX': '#####', 'FM': '#####', 'MD': 'MD-####',
  103. 'MC': '#####', 'MN': '######', 'ME': '#####', 'MS': '', 'MA': '#####',
  104. 'MZ': '####', 'MM': '#####', 'NA': '', 'NR': '', 'NP': '#####',
  105. 'NL': '#### @@', 'AN': '', 'NC': '#####', 'NZ': '####',
  106. 'NI': '###-###-#', 'NE': '####', 'NG': '######', 'NU': '', 'NF': '',
  107. 'MP': '', 'NO': '####', 'OM': '###', 'PK': '#####', 'PW': '96940',
  108. 'PS': '', 'PA': '', 'PG': '###', 'PY': '####', 'PE': '', 'PH': '####',
  109. 'PN': '', 'PL': '##-###', 'PT': '####-###', 'PR': '#####-####',
  110. 'QA': '', 'RE': '#####', 'RO': '######', 'RU': '######', 'RW': '',
  111. 'BL': '### ###', 'SH': 'STHL 1ZZ', 'KN': '', 'LC': '', 'MF': '### ###',
  112. 'PM': '', 'VC': '', 'WS': '', 'SM': '4789#', 'ST': '', 'SA': '#####',
  113. 'SN': '#####', 'RS': '######', 'SC': '', 'SL': '', 'SG': '######',
  114. 'SK': '### ##', 'SI': 'SI- ####', 'SB': '', 'SO': '@@ #####',
  115. 'ZA': '####', 'GS': '', 'ES': '#####', 'LK': '#####', 'SD': '#####',
  116. 'SR': '', 'SJ': '', 'SZ': '@###', 'SE': 'SE-### ##', 'CH': '####',
  117. 'SY': '', 'TW': '#####', 'TJ': '######', 'TZ': '', 'TH': '#####',
  118. 'TL': '', 'TG': '', 'TK': '', 'TO': '', 'TT': '', 'TN': '####',
  119. 'TR': '#####', 'TM': '######', 'TC': 'TKCA 1ZZ', 'TV': '', 'UG': '',
  120. 'UA': '#####', 'AE': '',
  121. 'GB': '@# #@@|@## #@@|@@# #@@|@@## #@@|@#@ #@@|@@#@ #@@|GIR0AA',
  122. 'US': '#####-####', 'UM': '', 'UY': '#####', 'UZ': '######', 'VU': '',
  123. 'VE': '####', 'VN': '######', 'VG': '', 'VI': '', 'WF': '', 'EH': '',
  124. 'YE': '', 'ZM': '#####', 'ZW': ''
  125. }
  126. for iso, formatstr in _formats.iteritems():
  127. _formats[iso] = PostalCodeFormat(formatstr)
  128. @classmethod
  129. def split(cls, str_, iso=''):
  130. '''
  131. Split string <str_> in (postalcode, remainder) following the specs of
  132. country <iso>.
  133. Returns iso, postal code and the remaining part of <str_>.
  134. When iso is filled but postal code remains empty, no postal code could
  135. be found according to the rules of iso.
  136. When iso is empty but postal code is not, a proximity match was
  137. made where multiple hits gave similar results. A postal code is
  138. likely, but a unique iso code could not be identified.
  139. When neither iso or postal code are filled, no proximity match could
  140. be made.
  141. '''
  142. if iso in cls._formats:
  143. return (iso,) + tuple(cls._formats[iso].split(str_))
  144. # Find optimum (= max length postalcode) when iso code is unknown
  145. all = {}
  146. max_l = 0
  147. for key in cls._formats.iterkeys():
  148. i, p, c = cls.split(str_, key)
  149. l = len(p)
  150. if l > max_l:
  151. max_l = l
  152. if l in all:
  153. all[l].append((i, p, c))
  154. else:
  155. all[l] = [(i, p, c)]
  156. if max_l > 0:
  157. if len(all[max_l]) > 1:
  158. return ('',) + all[max_l][0][1:]
  159. return all[max_l][0]
  160. return ('', '', str_)
  161. @classmethod
  162. def get(cls, iso, str_):
  163. '''
  164. Extracts the postal code from str_ following the specs of country
  165. <iso>.
  166. '''
  167. if iso in cls._formats:
  168. return cls._formats[iso].get(str_)
  169. return ''
  170. get = PostalCode.get
  171. split = PostalCode.split