* Correct the Sniffer documentation to match the implementation.
* Add an optional delimiters argument to Sniffer.sniff() that restricts the
set of candidate field delimiters.
diff --git a/Doc/lib/libcsv.tex b/Doc/lib/libcsv.tex
index f30aefe..a287ba8 100644
--- a/Doc/lib/libcsv.tex
+++ b/Doc/lib/libcsv.tex
@@ -152,17 +152,17 @@
\class{reader} or \class{writer} instance.
\end{classdesc*}
-\begin{classdesc}{Sniffer}{\optional{sample=16384}}
-The \class{Sniffer} class is used to deduce the format of a CSV file. The
-optional \var{sample} argument to the constructor specifies the number of
-bytes to use when determining Dialect parameters.
+\begin{classdesc}{Sniffer}{}
+The \class{Sniffer} class is used to deduce the format of a CSV file.
\end{classdesc}
The \class{Sniffer} class provides a single method:
-\begin{methoddesc}{sniff}{fileobj}
-Analyze the next chunk of \var{fileobj} and return a \class{Dialect} subclass
-reflecting the parameters found.
+\begin{methoddesc}{sniff}{sample\optional{,delimiters=None}}
+Analyze the given \var{sample} and return a \class{Dialect} subclass
+reflecting the parameters found. If the optional \var{delimiters} parameter
+is given, it is interpreted as a string containing possible valid delimiter
+characters.
\end{methoddesc}
\begin{methoddesc}{has_header}{sample}
diff --git a/Lib/csv.py b/Lib/csv.py
index 7e297b6..83b8aa4 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -159,15 +159,16 @@
self.preferred = [',', '\t', ';', ' ', ':']
- def sniff(self, sample):
+ def sniff(self, sample, delimiters=None):
"""
Returns a dialect (or None) corresponding to the sample
"""
quotechar, delimiter, skipinitialspace = \
- self._guess_quote_and_delimiter(sample)
+ self._guess_quote_and_delimiter(sample, delimiters)
if delimiter is None:
- delimiter, skipinitialspace = self._guess_delimiter(sample)
+ delimiter, skipinitialspace = self._guess_delimiter(sample,
+ delimiters)
class dialect(Dialect):
_name = "sniffed"
@@ -184,7 +185,7 @@
return dialect
- def _guess_quote_and_delimiter(self, data):
+ def _guess_quote_and_delimiter(self, data, delimiters):
"""
Looks for text enclosed between two identical quotes
(the probable quotechar) which are preceded and followed
@@ -222,7 +223,7 @@
key = m[n]
except KeyError:
continue
- if key:
+ if key and (delimiters is None or key in delimiters):
delims[key] = delims.get(key, 0) + 1
try:
n = regexp.groupindex['space'] - 1
@@ -248,7 +249,7 @@
return (quotechar, delim, skipinitialspace)
- def _guess_delimiter(self, data):
+ def _guess_delimiter(self, data, delimiters):
"""
The delimiter /should/ occur the same number of times on
each row. However, due to malformed data, it may not. We don't want
@@ -316,7 +317,8 @@
while len(delims) == 0 and consistency >= threshold:
for k, v in modeList:
if v[0] > 0 and v[1] > 0:
- if (v[1]/total) >= consistency:
+ if ((v[1]/total) >= consistency and
+ (delimiters is None or k in delimiters)):
delims[k] = v
consistency -= 0.01
diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py
index c0ad645..00ba8cd 100644
--- a/Lib/test/test_csv.py
+++ b/Lib/test/test_csv.py
@@ -551,6 +551,12 @@
header = '''\
"venue","city","state","date","performers"
'''
+ sample3 = '''\
+05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
+05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
+05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
+'''
+
def test_has_header(self):
sniffer = csv.Sniffer()
self.assertEqual(sniffer.has_header(self.sample1), False)
@@ -568,6 +574,15 @@
self.assertEqual(dialect.quotechar, "'")
self.assertEqual(dialect.skipinitialspace, False)
+ def test_delimiters(self):
+ sniffer = csv.Sniffer()
+ dialect = sniffer.sniff(self.sample3)
+ self.assertEqual(dialect.delimiter, "0")
+ dialect = sniffer.sniff(self.sample3, delimiters="?,")
+ self.assertEqual(dialect.delimiter, "?")
+ dialect = sniffer.sniff(self.sample3, delimiters="/,")
+ self.assertEqual(dialect.delimiter, "/")
+
if not hasattr(sys, "gettotalrefcount"):
if test_support.verbose: print "*** skipping leakage tests ***"
else: