Comparaison des performances
Remarque : le temps indiqué dans le tableau ne comprend pas le temps nécessaire au chargement des fichiers.
| approach | american-english, | big.txt, | time w.r.t. defaultdict |
| | time, seconds | time, seconds | |
|----------------+-------------------+---------------+-------------------------|
| Counter | 0.451 | 3.367 | 3.6 |
| setdefault | 0.348 | 2.320 | 2.5 |
| list | 0.277 | 1.822 | 2 |
| try/except | 0.158 | 1.068 | 1.2 |
| defaultdict | 0.141 | 0.925 | 1 |
| numpy | 0.012 | 0.076 | 0.082 |
| S.Mark's ext. | 0.003 | 0.019 | 0.021 |
| ext. in Cython | 0.001 | 0.008 | 0.0086 |
#+TBLFM: $4=$3/@7$3;%.2g
Les fichiers utilisés : '/usr/share/dict/american-english'
y 'big.txt'
.
Le script qui compare les solutions de 'Counter', 'setdefault', 'list', 'try/except', 'defaultdict', 'numpy', 'cython' et @S.Mark se trouve à l'adresse suivante http://gist.github.com/347000
La solution la plus rapide est l'extension Python écrite en Cython :
import cython
@cython.locals(
chars=unicode,
i=cython.Py_ssize_t,
L=cython.Py_ssize_t[0x10000])
def countchars_cython(chars):
for i in range(0x10000): # unicode code points > 0xffff are not supported
L[i] = 0
for c in chars:
L[c] += 1
return {unichr(i): L[i] for i in range(0x10000) if L[i]}
Comparaison précédente :
* python (dict) : 0.5 seconds
* python (list) : 0.5 (ascii) (0.2 if read whole file in memory)
* perl : 0.5
* python (numpy): 0.07
* c++ : 0.05
* c : 0.008 (ascii)
Données d'entrée :
$ tail /usr/share/dict/american-english
éclat's
élan
élan's
émigré
émigrés
épée
épées
étude
étude's
études
$ du -h /usr/share/dict/american-english
912K /usr/share/dict/american-english
python (compteur) : 0.5 secondes
#!/usr/bin/env python3.1
import collections, fileinput, textwrap
chars = (ch for word in fileinput.input() for ch in word.rstrip())
# faster (0.4s) but less flexible: chars = open(filename).read()
print(textwrap.fill(str(collections.Counter(chars)), width=79))
Exécutez-le :
$ time -p python3.1 count_char.py /usr/share/dict/american-english
Counter({'e': 87823, 's': 86620, 'i': 66548, 'a': 62778, 'n': 56696, 'r':
56286, 't': 51588, 'o': 48425, 'l': 39914, 'c': 30020, 'd': 28068, 'u': 25810,
"'": 24511, 'g': 22262, 'p': 20917, 'm': 20747, 'h': 18453, 'b': 14137, 'y':
12367, 'f': 10049, 'k': 7800, 'v': 7573, 'w': 6924, 'z': 3088, 'x': 2082, 'M':
1686, 'C': 1549, 'S': 1515, 'q': 1447, 'B': 1387, 'j': 1376, 'A': 1345, 'P':
974, 'L': 912, 'H': 860, 'T': 858, 'G': 811, 'D': 809, 'R': 749, 'K': 656, 'E':
618, 'J': 539, 'N': 531, 'W': 507, 'F': 502, 'O': 354, 'I': 344, 'V': 330, 'Z':
150, 'Y': 140, 'é': 128, 'U': 117, 'Q': 63, 'X': 42, 'è': 29, 'ö': 12, 'ü': 12,
'ó': 10, 'á': 10, 'ä': 7, 'ê': 6, 'â': 6, 'ñ': 6, 'ç': 4, 'å': 3, 'û': 3, 'í':
2, 'ô': 2, 'Å': 1})
real 0.44
user 0.43
sys 0.01
perl : 0.5 secondes
time -p perl -MData::Dumper -F'' -lanwe'$c{$_}++ for (@F);
END{ $Data::Dumper::Terse = 1; $Data::Dumper::Indent = 0; print Dumper(\%c) }
' /usr/share/dict/american-english
Sortie :
{'S' => 1515,'K' => 656,'' => 29,'d' => 28068,'Y' => 140,'E' => 618,'y' => 12367,'g' => 22262,'e' => 87823,'' => 2,'J' => 539,'' => 241,'' => 3,'' => 6,'' => 4,'' => 128,'D' => 809,'q' => 1447,'b' => 14137,'z' => 3088,'w' => 6924,'Q' => 63,'' => 10,'M' => 1686,'C' => 1549,'' => 10,'L' => 912,'X' => 42,'P' => 974,'' => 12,'\\'' => 24511,'' => 6,'a' => 62778,'T' => 858,'N' => 531,'j' => 1376,'Z' => 150,'u' => 25810,'k' => 7800,'t' => 51588,'' => 6,'W' => 507,'v' => 7573,'s' => 86620,'B' => 1387,'H' => 860,'c' => 30020,'' => 12,'I' => 344,'' => 3,'G' => 811,'U' => 117,'F' => 502,'' => 2,'r' => 56286,'x' => 2082,'V' => 330,'h' => 18453,'f' => 10049,'' => 1,'i' => 66548,'A' => 1345,'O' => 354,'n' => 56696,'m' => 20747,'l' => 39914,'' => 7,'p' => 20917,'R' => 749,'o' => 48425}
real 0.51
user 0.49
sys 0.02
python (numpy) : 0.07 secondes
Sur la base de La réponse de Fourmis Aasma (modifié pour supporter l'unicode) :
#!/usr/bin/env python
import codecs, itertools, operator, sys
import numpy
filename = sys.argv[1] if len(sys.argv)>1 else '/usr/share/dict/american-english'
# ucs2 or ucs4 python?
dtype = {2: numpy.uint16, 4: numpy.uint32}[len(buffer(u"u"))]
# count ordinals
text = codecs.open(filename, encoding='utf-8').read()
a = numpy.frombuffer(text, dtype=dtype)
counts = numpy.bincount(a)
# pretty print
counts = [(unichr(i), v) for i, v in enumerate(counts) if v]
counts.sort(key=operator.itemgetter(1))
print ' '.join('("%s" %d)' % c for c in counts if c[0] not in ' \t\n')
Sortie :
("Å" 1) ("í" 2) ("ô" 2) ("å" 3) ("û" 3) ("ç" 4) ("â" 6) ("ê" 6) ("ñ" 6) ("ä" 7) ("á" 10) ("ó" 10) ("ö" 12) ("ü" 12) ("è" 29) ("X" 42) ("Q" 63) ("U" 117) ("é" 128) ("Y" 140) ("Z" 150) ("V" 330) ("I" 344) ("O" 354) ("F" 502) ("W" 507) ("N" 531) ("J" 539) ("E" 618) ("K" 656) ("R" 749) ("D" 809) ("G" 811) ("T" 858) ("H" 860) ("L" 912) ("P" 974) ("A" 1345) ("j" 1376) ("B" 1387) ("q" 1447) ("S" 1515) ("C" 1549) ("M" 1686) ("x" 2082) ("z" 3088) ("w" 6924) ("v" 7573) ("k" 7800) ("f" 10049) ("y" 12367) ("b" 14137) ("h" 18453) ("m" 20747) ("p" 20917) ("g" 22262) ("'" 24511) ("u" 25810) ("d" 28068) ("c" 30020) ("l" 39914) ("o" 48425) ("t" 51588) ("r" 56286) ("n" 56696) ("a" 62778) ("i" 66548) ("s" 86620) ("e" 87823)
real 0.07
user 0.06
sys 0.01
c++ : 0,05 seconde
// $ g++ *.cc -lboost_program_options
// $ ./a.out /usr/share/dict/american-english
#include <iostream>
#include <fstream>
#include <cstdlib> // exit
#include <boost/program_options/detail/utf8_codecvt_facet.hpp>
#include <boost/tr1/unordered_map.hpp>
#include <boost/foreach.hpp>
int main(int argc, char* argv[]) {
using namespace std;
// open input file
if (argc != 2) {
cerr << "Usage: " << argv[0] << " <filename>\n";
exit(2);
}
wifstream f(argv[argc-1]);
// assume the file has utf-8 encoding
locale utf8_locale(locale(""),
new boost::program_options::detail::utf8_codecvt_facet);
f.imbue(utf8_locale);
// count characters frequencies
typedef std::tr1::unordered_map<wchar_t, size_t> hashtable_t;
hashtable_t counts;
for (wchar_t ch; f >> ch; )
counts[ch]++;
// print result
wofstream of("output.utf8");
of.imbue(utf8_locale);
BOOST_FOREACH(hashtable_t::value_type i, counts)
of << "(" << i.first << " " << i.second << ") ";
of << endl;
}
Résultat :
$ cat output.utf8
(í 2) (O 354) (P 974) (Q 63) (R 749) (S 1,515) (ñ 6) (T 858) (U 117) (ó 10) (ô 2) (V 330) (W 507) (X 42) (ö 12) (Y 140) (Z 150) (û 3) (ü 12) (a 62,778) (b 14,137) (c 30,020) (d 28,068) (e 87,823) (f 10,049) (g 22,262) (h 18,453) (i 66,548) (j 1,376) (k 7,800) (l 39,914) (m 20,747) (n 56,696) (o 48,425) (p 20,917) (q 1,447) (r 56,286) (s 86,620) (t 51,588) (u 25,810) (Å 1) (' 24,511) (v 7,573) (w 6,924) (x 2,082) (y 12,367) (z 3,088) (A 1,345) (B 1,387) (C 1,549) (á 10) (â 6) (D 809) (E 618) (F 502) (ä 7) (å 3) (G 811) (H 860) (ç 4) (I 344) (J 539) (è 29) (K 656) (é 128) (ê 6) (L 912) (M 1,686) (N 531)
c (ascii) : 0,0079 seconde
// $ gcc -O3 cc_ascii.c -o cc_ascii && time -p ./cc_ascii < input.txt
#include <stdio.h>
enum { N = 256 };
size_t counts[N];
int main(void) {
// count characters
int ch = -1;
while((ch = getchar()) != EOF)
++counts[ch];
// print result
size_t i = 0;
for (; i < N; ++i)
if (counts[i])
printf("('%c' %zu) ", (int)i, counts[i]);
return 0;
}