From 12d2e57c55b93325b695401ae767700dcccb8637 Mon Sep 17 00:00:00 2001
From: "Gergely POLONKAI (W00d5t0ck)"
Date: Mon, 9 Sep 2013 22:56:09 +0200
Subject: [PATCH] geodata.xml is now generated correctly

Data is downloaded from geonames.org, and processed with an AWK and a
Perl script. The result is the part of the distribution, so the average
user (or a packager) doesn't have to download that much of data.
---
Reviewer note (ignored by git-am): this copy was rebuilt from a
whitespace-collapsed dump of the patch. Line breaks follow the hunk
headers' line counts; indentation and the exact position of blank context
lines are reconstructed and should be checked against commit 12d2e57
before applying (or apply with --ignore-whitespace). The three
"while (<HANDLE>)" reads in geonames_process.pl had been stripped to
"while ()" and were restored from the neighbouring open()/close() calls.
The sender address on the From: line was also lost and is not restored.

 .gitignore                         |  1 +
 TODO                               |  2 +-
 configure.ac                       |  3 ++
 data/geonames/Makefile.am          | 53 +++++++++++++++++++++++++++++-
 data/geonames/geonames_process.awk |  7 ++++
 data/geonames/geonames_process.pl  | 50 +++++++++++++++++-----------
 6 files changed, 94 insertions(+), 22 deletions(-)
 create mode 100644 data/geonames/geonames_process.awk

diff --git a/.gitignore b/.gitignore
index d7f2dd8..d44d079 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,6 +37,7 @@ Makefile.in
 
 # Geonames related things
 /data/geonames/*.txt
+/data/geonames/*.zip
 /data/geonames/geodata.xml
 
 # Translation related files
diff --git a/TODO b/TODO
index d9fab52..b164244 100644
--- a/TODO
+++ b/TODO
@@ -9,4 +9,4 @@
 ** Hidden ascendent
 ** Vertex
 ** Symbols of Uranus and Pluto
-
+* Custom city database (or multiple databases) or adding custom locations
diff --git a/configure.ac b/configure.ac
index 98da19e..6042afa 100644
--- a/configure.ac
+++ b/configure.ac
@@ -13,6 +13,9 @@ AM_PROG_CC_C_O
 AM_PROG_AR
 AC_PROG_LIBTOOL
 AC_PATH_PROGS(PERL, [perl5 perl])
+AC_PATH_PROGS(WGET, [wget])
+AC_PATH_PROGS(CURL, [curl])
+AC_PATH_PROGS(UNZIP, [unzip])
 have_geonames_perl_modules=no
 AX_PROG_PERL_MODULES([XML::Writer IO::File], [have_geonames_perl_modules=yes], AC_MSG_WARN([XML::Writer and IO::File perl modules are required if you want to regenerate geodata.xml!]))
 AC_SUBST([have_geonames_perl_modules])
diff --git a/data/geonames/Makefile.am b/data/geonames/Makefile.am
index 0941a1a..4afe4b8 100644
--- a/data/geonames/Makefile.am
+++ b/data/geonames/Makefile.am
@@ -5,7 +5,58 @@ EXTRA_DIST = geodata.xml
 
 MAINTAINERCLEANFILES = geodata.xml
 
-geodata.xml:
+countryInfoURL = "http://download.geonames.org/export/dump/countryInfo.txt"
+timeZonesURL = "http://download.geonames.org/export/dump/timeZones.txt"
+allCountriesURL = "http://download.geonames.org/export/dump/allCountries.zip"
+
+countryInfo.txt:
+	@-if test -x $(WGET); then \
+		$(WGET) $(countryInfoURL); \
+	else \
+		if test -x $(CURL); then \
+			$(CURL) $(countryInfoURL) > "$@" ; \
+		else \
+			echo "wget and curl could not be found in your PATH."; \
+			echo "One of them is needed to create geodata.xml!"; \
+		fi; \
+	fi
+
+timeZones.txt:
+	@-if test -x $(WGET); then \
+		$(WGET) $(timeZonesURL); \
+	else \
+		if test -x $(CURL); then \
+			$(CURL) $(timeZonesURL) > "$@" ; \
+		else \
+			echo "wget and curl could not be found in your PATH."; \
+			echo "One of them is needed to create geodata.xml!"; \
+		fi; \
+	fi
+
+allCountries.zip:
+	@-if test -x $(WGET); then \
+		$(WGET) $(allCountriesURL); \
+	else \
+		if test -x $(CURL); then \
+			$(CURL) $(allCountriesURL) > "$@" ; \
+		else \
+			echo "wget and curl could not be found in your PATH."; \
+			echo "One of them is needed to create geodata.xml!"; \
+		fi; \
+	fi;
+
+cities.txt: allCountries.zip
+	$(AM_V_GEN) if test -x $(UNZIP); then \
+		$(UNZIP) allCountries.zip; \
+	else \
+		echo "unzip could not be found in your PATH."; \
+		echo "It is needed to create geodata.xml!"; \
+	fi; \
+	if test -f allCountries.txt; then \
+		$(AWK) -f geonames_process.awk allCountries.txt > $@ ; \
+	fi
+
+geodata.xml: countryInfo.txt timeZones.txt cities.txt
 	$(AM_V_GEN) if test -x "$(PERL)"; then \
 		if test "x$(have_geonames_perl_modules)" = "xyes" -o "x$(I_HAVE_PERL_MODULES)" = "xyes"; then \
 			$(PERL) geonames_process.pl; \
diff --git a/data/geonames/geonames_process.awk b/data/geonames/geonames_process.awk
new file mode 100644
index 0000000..57f0cd6
--- /dev/null
+++ b/data/geonames/geonames_process.awk
@@ -0,0 +1,7 @@
+BEGIN {
+    FS="\t"
+}
+{
+    if ($7 != "P" || $8 != "PPL" || $15 < 1000) next
+    print $9 FS $2 FS $5 FS $6 FS $16 FS $18
+}
diff --git a/data/geonames/geonames_process.pl b/data/geonames/geonames_process.pl
index 00c24fc..5311276 100644
--- a/data/geonames/geonames_process.pl
+++ b/data/geonames/geonames_process.pl
@@ -4,25 +4,22 @@ use strict;
 use IO::File;
 use XML::Writer;
 
-# TODO: Download http://download.geonames.org/export/dump/countryInfo.txt
-# TODO: Download http://download.geonames.org/export/dump/timeZones.txt
-# TODO: Download http://download.geonames.org/export/dump/allCountries.zip
-# TODO: Unzip allCountries.zip
-
 my %time_zones = ();
 my %countries = ();
 
 open(TIMEZONES, 'timeZones.txt') or die("Cannot open timeZones.txt: $!\n");
 while (<TIMEZONES>) {
     my ($country_code, $timezone_id, $gmt_offset_january, $gmt_offset_july, $gmt_offset_raw) = split(/\t/, $_);
+    next if ($country_code !~ /^[A-Z]{2}$/);
 
-    $time_zones{$country_code . '_' . $timezone_id} = {offset => $gmt_offset_january, dst_offset => $gmt_offset_july};
+    $time_zones{$timezone_id} = {offset => $gmt_offset_january, dst_offset => $gmt_offset_july};
 }
 close(TIMEZONES);
 
 open(COUNTRIES, 'countryInfo.txt') or die("Cannot open countryInfo.txt: $!\n");
 while (<COUNTRIES>) {
     my ($country_code, $iso3, $iso_numeric, $fips, $name, $capital, $area, $population, $continent, $tld, $currency_code, $currency_name, $phone, $postal_code_format, $postal_code_regex, $languages, $geonameid, $neighbours, $equivalent_fips_code) = split(/\t/, $_);
+    next if ($country_code !~ /^[A-Z]{2}$/);
 
     if ($country_code =~ /^[A-Z]{2}$/) {
         $countries{$country_code} = $name;
@@ -30,7 +27,7 @@ while (<COUNTRIES>) {
 }
 close(COUNTRIES);
 
-open(GEONAMES, "HU.txt") or die("Cannot open HU.txt: $!\n");
+open(GEONAMES, "cities.txt") or die("Cannot open cities.txt: $!\n");
 
 my $xml_file = IO::File->new('>geodata.xml');
 my $writer = XML::Writer->new(OUTPUT => $xml_file, NEWLINES => 0);
@@ -38,23 +35,36 @@ my $writer = XML::Writer->new(OUTPUT => $xml_file, NEWLINES => 0);
 $writer->xmlDecl('utf-8');
 $writer->startTag('geodata');
 
-# TODO: process all files, not just HU.txt!
 while (<GEONAMES>) {
-    my ($geonameid, $name, $asciiname, $alternatenames, $latitude, $longitude, $feature_class, $feature_code, $country_code, $alt_country_code, $admin1, $admin2, $admin3, $admin4, $population, $elevation, $dem, $timezone, $mod_date) = split(/\t/, $_);
+    chomp($_);
+    my ($country_code, $name, $latitude, $longitude, $elevation, $timezone) = split(/\t/, $_);
 
-    if (($feature_class eq 'P') && ($feature_code eq 'PPL')) {
-        $writer->emptyTag('place',
-                          'name' => $name,
-                          'latitude' => $latitude,
-                          'longitude' => $longitude,
-                          'elevation' => $elevation,
-                          'country' => $countries{$country_code},
-                          'time_offset' => $time_zones{$country_code . '_' . $timezone}->{offset},
-                          'time_offset_dst' => $time_zones{$country_code . '_' . $timezone}->{dst_offset}
-                      );
+    if (!exists($countries{$country_code})) {
+        print "Unknown country code: $country_code\n";
+        next;
     }
+
+    if (!exists($time_zones{$timezone})) {
+        print "Unknown time zone: $timezone\n";
+        next;
+    }
+
+    $writer->emptyTag('p',
+                      'n' => $name,
+                      'lat' => $latitude,
+                      'lon' => $longitude,
+                      'alt' => $elevation,
+                      'c' => $country_code,
+                      'tzo' => $time_zones{$timezone}->{offset},
+                      'tzd' => $time_zones{$timezone}->{dst_offset}
+                  );
+
+    print $., "\n" if ($. % 19083 == 0);
 }
-close GEONAMES;
 
 $writer->endTag('geodata');
 
+$writer->end();
+$xml_file->close();
+
+close GEONAMES;