1080a9895SYuri Pankov#!/usr/bin/perl -wC
2080a9895SYuri Pankov
3080a9895SYuri Pankov#
4080a9895SYuri Pankov# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
5080a9895SYuri Pankov# Copyright 2015 John Marino <draco@marino.st>
6080a9895SYuri Pankov#
7080a9895SYuri Pankov# Redistribution and use in source and binary forms, with or without
8080a9895SYuri Pankov# modification, are permitted provided that the following conditions
9080a9895SYuri Pankov# are met:
10080a9895SYuri Pankov# 1. Redistributions of source code must retain the above copyright
11080a9895SYuri Pankov#    notice, this list of conditions and the following disclaimer.
12080a9895SYuri Pankov# 2. Redistributions in binary form must reproduce the above copyright
13080a9895SYuri Pankov#    notice, this list of conditions and the following disclaimer in the
14080a9895SYuri Pankov#    documentation and/or other materials provided with the distribution.
15080a9895SYuri Pankov#
16080a9895SYuri Pankov# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17080a9895SYuri Pankov# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18080a9895SYuri Pankov# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19080a9895SYuri Pankov# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20080a9895SYuri Pankov# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21080a9895SYuri Pankov# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22080a9895SYuri Pankov# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23080a9895SYuri Pankov# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24080a9895SYuri Pankov# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25080a9895SYuri Pankov# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26080a9895SYuri Pankov# SUCH DAMAGE.
27080a9895SYuri Pankov#
28080a9895SYuri Pankov
29080a9895SYuri Pankovuse strict;
30080a9895SYuri Pankovuse Getopt::Long;
31080a9895SYuri Pankov
32080a9895SYuri Pankovif ($#ARGV != 0) {
33080a9895SYuri Pankov	print "Usage: $0 --unidata=</path/to/UnicodeData.txt>\n";
34080a9895SYuri Pankov	exit(1);
35080a9895SYuri Pankov}
36080a9895SYuri Pankov
37080a9895SYuri Pankovmy $UNIDATA = undef;
38080a9895SYuri Pankov
39080a9895SYuri Pankovmy $result = GetOptions (
40080a9895SYuri Pankov		"unidata=s"	=> \$UNIDATA
41080a9895SYuri Pankov	    );
42080a9895SYuri Pankov
43080a9895SYuri Pankovmy %utf8map = ();
44080a9895SYuri Pankovmy $outfilename = "data/common.UTF-8.src";
45080a9895SYuri Pankov
46080a9895SYuri Pankovget_utf8map("data/UTF-8.cm");
47080a9895SYuri Pankovgenerate_header ();
48080a9895SYuri Pankovparse_unidata ("$UNIDATA");
49080a9895SYuri Pankovgenerate_footer ();
50080a9895SYuri Pankov
51080a9895SYuri Pankov############################
52080a9895SYuri Pankov
53080a9895SYuri Pankovsub get_utf8map {
54080a9895SYuri Pankov	my $file = shift;
55080a9895SYuri Pankov
56080a9895SYuri Pankov	open(FIN, $file);
57080a9895SYuri Pankov	my @lines = <FIN>;
58080a9895SYuri Pankov	close(FIN);
59080a9895SYuri Pankov	chomp(@lines);
60080a9895SYuri Pankov
61080a9895SYuri Pankov	my $incharmap = 0;
62080a9895SYuri Pankov	foreach my $l (@lines) {
63080a9895SYuri Pankov		$l =~ s/\r//;
64080a9895SYuri Pankov		next if ($l =~ /^\#/);
65080a9895SYuri Pankov		next if ($l eq "");
66080a9895SYuri Pankov
67080a9895SYuri Pankov		if ($l eq "CHARMAP") {
68080a9895SYuri Pankov			$incharmap = 1;
69080a9895SYuri Pankov			next;
70080a9895SYuri Pankov		}
71080a9895SYuri Pankov
72080a9895SYuri Pankov		next if (!$incharmap);
73080a9895SYuri Pankov		last if ($l eq "END CHARMAP");
74080a9895SYuri Pankov
75080a9895SYuri Pankov		$l =~ /^(<[^\s]+>)\s+(.*)/;
76080a9895SYuri Pankov		my $k = $2;
77080a9895SYuri Pankov		my $v = $1;
78080a9895SYuri Pankov		$k =~ s/\\x//g;		# UTF-8 char code
79080a9895SYuri Pankov		$utf8map{$k} = $v;
80080a9895SYuri Pankov	}
81080a9895SYuri Pankov}
82080a9895SYuri Pankov
83080a9895SYuri Pankovsub generate_header {
84080a9895SYuri Pankov	open(FOUT, ">", "$outfilename")
85080a9895SYuri Pankov		or die ("can't write to $outfilename\n");
86080a9895SYuri Pankov	print FOUT "LC_CTYPE\n\n";
87080a9895SYuri Pankov}
88080a9895SYuri Pankov
89080a9895SYuri Pankovsub generate_footer {
90080a9895SYuri Pankov	print FOUT "\nEND LC_CTYPE\n";
91080a9895SYuri Pankov	close (FOUT);
92080a9895SYuri Pankov}
93080a9895SYuri Pankov
94080a9895SYuri Pankovsub wctomb {
95080a9895SYuri Pankov	my $wc = hex(shift);
96080a9895SYuri Pankov	my $lead;
97080a9895SYuri Pankov	my $len;
98080a9895SYuri Pankov	my $ret = "";
99080a9895SYuri Pankov	my $i;
100080a9895SYuri Pankov
101080a9895SYuri Pankov	if (($wc & ~0x7f) == 0) {
102080a9895SYuri Pankov		return sprintf "%02X", $wc;
103080a9895SYuri Pankov	} elsif (($wc & ~0x7ff) == 0) {
104080a9895SYuri Pankov		$lead = 0xc0;
105080a9895SYuri Pankov		$len = 2;
106080a9895SYuri Pankov	} elsif (($wc & ~0xffff) == 0) {
107080a9895SYuri Pankov		$lead = 0xe0;
108080a9895SYuri Pankov		$len = 3;
109080a9895SYuri Pankov	} elsif ($wc >= 0 && $wc <= 0x10ffff) {
110080a9895SYuri Pankov		$lead = 0xf0;
111080a9895SYuri Pankov		$len = 4;
112080a9895SYuri Pankov	}
113080a9895SYuri Pankov
114080a9895SYuri Pankov	for ($i = $len - 1; $i > 0; $i--) {
115080a9895SYuri Pankov		$ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret;
116080a9895SYuri Pankov		$wc >>= 6;
117080a9895SYuri Pankov	}
118080a9895SYuri Pankov	$ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret;
119080a9895SYuri Pankov
120080a9895SYuri Pankov	return $ret;
121080a9895SYuri Pankov}
122080a9895SYuri Pankov
123080a9895SYuri Pankovsub parse_unidata {
124080a9895SYuri Pankov	my $file = shift;
125080a9895SYuri Pankov	my %data = ();
126080a9895SYuri Pankov
127080a9895SYuri Pankov	open(FIN, $file);
128080a9895SYuri Pankov	my @lines = <FIN>;
129080a9895SYuri Pankov	close(FIN);
130080a9895SYuri Pankov	chomp(@lines);
131080a9895SYuri Pankov
132080a9895SYuri Pankov	foreach my $l (@lines) {
133080a9895SYuri Pankov		my @d = split(/;/, $l, -1);
134080a9895SYuri Pankov		my $mb = wctomb($d[0]);
135080a9895SYuri Pankov		my $cat;
136080a9895SYuri Pankov
137080a9895SYuri Pankov		# XXX There are code points present in UnicodeData.txt
138080a9895SYuri Pankov		# and missing from UTF-8.cm
139080a9895SYuri Pankov		next if !defined $utf8map{$mb};
140080a9895SYuri Pankov
141080a9895SYuri Pankov		# Define the category
142080a9895SYuri Pankov		if ($d[2] =~ /^Lu/) {
143080a9895SYuri Pankov			$cat = "upper";
144080a9895SYuri Pankov		} elsif ($d[2] =~ /^Ll/) {
145080a9895SYuri Pankov			$cat = "lower";
146080a9895SYuri Pankov		} elsif ($d[2] =~ /^Nd/) {
147080a9895SYuri Pankov			$cat = "digit";
148080a9895SYuri Pankov		} elsif ($d[2] =~ /^L/) {
149080a9895SYuri Pankov			$cat = "alpha";
150080a9895SYuri Pankov		} elsif ($d[2] =~ /^P/) {
151080a9895SYuri Pankov			$cat = "punct";
152*3052595aSYuri Pankov		} elsif ($d[2] =~ /^Co/ || $d[2] =~ /^M/ || $d[2] =~ /^N/ ||
153*3052595aSYuri Pankov		    $d[2] =~ /^S/) {
154080a9895SYuri Pankov			$cat = "graph";
155080a9895SYuri Pankov		} elsif ($d[2] =~ /^C/) {
156080a9895SYuri Pankov			$cat = "cntrl";
157080a9895SYuri Pankov		} elsif ($d[2] =~ /^Z/) {
158080a9895SYuri Pankov			$cat = "space";
159080a9895SYuri Pankov		}
160080a9895SYuri Pankov		$data{$cat}{$mb}{'wc'} = $d[0];
161080a9895SYuri Pankov
162080a9895SYuri Pankov		# Check if it's a start or end of range
163080a9895SYuri Pankov		if ($d[1] =~ /First>$/) {
164080a9895SYuri Pankov			$data{$cat}{$mb}{'start'} = 1;
165080a9895SYuri Pankov		} elsif ($d[1] =~ /Last>$/) {
166080a9895SYuri Pankov			$data{$cat}{$mb}{'end'} = 1;
167080a9895SYuri Pankov		}
168080a9895SYuri Pankov
169080a9895SYuri Pankov		# Check if there's upper/lower mapping
170080a9895SYuri Pankov		if ($d[12] ne "") {
171080a9895SYuri Pankov			$data{'toupper'}{$mb} = wctomb($d[12]);
172080a9895SYuri Pankov		} elsif ($d[13] ne "") {
173080a9895SYuri Pankov			$data{'tolower'}{$mb} = wctomb($d[13]);
174080a9895SYuri Pankov		}
175080a9895SYuri Pankov	}
176080a9895SYuri Pankov
177080a9895SYuri Pankov	my $first;
178080a9895SYuri Pankov	my $inrange = 0;
179080a9895SYuri Pankov
180080a9895SYuri Pankov	# Now write out the categories
181080a9895SYuri Pankov	foreach my $cat (sort keys (%data)) {
182080a9895SYuri Pankov		print FOUT "$cat\t";
183080a9895SYuri Pankov		$first = 1;
184080a9895SYuri Pankov	foreach my $mb (sort keys (%{$data{$cat}})) {
185080a9895SYuri Pankov		if ($first == 1) {
186080a9895SYuri Pankov			$first = 0;
187080a9895SYuri Pankov		} elsif ($inrange == 1) {
188080a9895SYuri Pankov			# Safety belt
189080a9895SYuri Pankov			die "broken range end wc=$data{$cat}{$mb}{'wc'}"
190080a9895SYuri Pankov			    if !defined $data{$cat}{$mb}{'end'};
191080a9895SYuri Pankov			print FOUT ";...;";
192080a9895SYuri Pankov			$inrange = 0;
193080a9895SYuri Pankov		} else {
194080a9895SYuri Pankov			print FOUT ";/\n\t";
195080a9895SYuri Pankov		}
196080a9895SYuri Pankov
197080a9895SYuri Pankov		if ($cat eq "tolower" || $cat eq "toupper") {
198080a9895SYuri Pankov			print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})";
199080a9895SYuri Pankov		} else {
200080a9895SYuri Pankov			if (defined($data{$cat}{$mb}{'start'})) {
201080a9895SYuri Pankov				$inrange = 1;
202080a9895SYuri Pankov			}
203080a9895SYuri Pankov			print FOUT "$utf8map{$mb}";
204080a9895SYuri Pankov		}
205080a9895SYuri Pankov	}
206080a9895SYuri Pankov		print FOUT "\n";
207080a9895SYuri Pankov	}
208080a9895SYuri Pankov}
209