<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
<!--
Copyright © 1991-2013 Unicode, Inc.
CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
For terms of use, see http://www.unicode.org/copyright.html
-->
<supplementalData>
	<version number="$Revision: 12137 $"/>
	<transforms>
		<!--
		Han <-> Spacedhan (internal): the forward rules insert a space at the
		boundary between Han text/punctuation and other letters; the reverse
		rules remove such a space again.

		The content of tRule is line-oriented rule text: a '#' comment runs to
		the end of its line, so every comment and every rule below must stay on
		its own line. Do not reflow or join these lines.
		-->
		<transform source="Han" target="Spacedhan" direction="both" visibility="internal">
			<tRule>
# Only intended for internal use
# Make sure Han are normalized, including characters that contain them.
# The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:ideographic:]-[:sc=han:]
# Where XXX is the resolved [:ideographic:][:sc=han:]. It needs updating with each Unicode release!
:: [[㆒-㆟㈠-㉇㊀-㊰㋀-㋋㍘-㍰㍻-㍿㏠-㏾ 🈐-🈒🈔-🈺🉀-🉈🉐🉑][:ideographic:][:sc=han:]] nfkc;
:: fullwidth-halfwidth;

# Map the ideographic full stop to an ASCII period.
。 → '.';

# Character classes used as context by the spacing rules below.
$terminalPunct = [\.\,\:\;\?\!.,:?!。、;[:Pe:][:Pf:]];
# Written as one enclosing set so the variable is a single character class,
# in the same way as the terminal punctuation variable above.
$initialPunct = [[:Ps:][:Pi:]];

# add space between any Han or terminal punctuation and letters, and
# between letters and Han or initial punct
[[:Ideographic:] $terminalPunct] {} [:Letter:] → ' ' ;
[:Letter:] [:Mark:]* {} [[:Ideographic:] $initialPunct] → ' ' ;

# remove spacing between ideographs and other letters
← [:Ideographic:] { ' ' } [:Letter:] ;
← [:Letter:] [:Mark:]* { ' ' } [:Ideographic:] ;
			</tRule>
		</transform>
	</transforms>
</supplementalData>