Friday, 20 April 2018

从 ls 排序问题开始探索


网上有人问为什么ls ascii 排序, > 在前, - 在后, 这不科学啊。

首先, type -a ls 知道 ls 完整路径是 /bin/ls,然后(假设系统是 debian-based)  dpkg -S /bin/ls 知道 ls 属于 coreutils package。最后 apt source coreutils  下载源代码。

下载完毕后,find . -name '*ls*.c' 查找到 ls.c 路径, 然后看见:

/* Read directory NAME, and list the files in it.
   If REALNAME is nonzero, print its name instead of NAME;
   this is used for symbolic links to directories.
   COMMAND_LINE_ARG means this directory was mentioned on the command line.  */

static void
print_dir (char const *name, char const *realname, bool command_line_arg)
{
  DIR *dirp;
  struct dirent *next;
  uintmax_t total_blocks = 0;
  static bool first = true;

  errno = 0;
  dirp = opendir (name);
  if (!dirp)
    {
      file_failure (command_line_arg, _("cannot open directory %s"), name);
      return;
    }

  if (LOOP_DETECT)
    {
      struct stat dir_stat;
      int fd = dirfd (dirp);

      /* If dirfd failed, endure the overhead of using stat.  */
      if ((0 <= fd
           ? fstat (fd, &dir_stat)
           : stat (name, &dir_stat)) < 0)

可以看见 opendir  函数拿到目录 pointer dirp  后,把 pointer  传入 dirfd 函数获得目录 fd ... 省略。  再往后看见  dirp 传入 readdir 循环获取目录全部文件:

/* Read the directory entries, and insert the subfiles into the 'cwd_file'
     table.  */

  while (1)
    {
      /* Set errno to zero so we can distinguish between a readdir failure
         and when readdir simply finds that there are no more entries.  */
      errno = 0;
      next = readdir (dirp);

简单的程序来模拟 :

/*
 * This program displays the names of all files in the current directory.
 */

#include <dirent.h>
#include <stdio.h>
#include <locale.h>

/*
 * Print the name of every entry in the current directory, one per line,
 * in exactly the order readdir() returns them (nothing is sorted here).
 *
 * Returns 0 on success, or 1 if the directory could not be opened.
 */
int main(void) {
  DIR *d;
  struct dirent *dir;

  /* An empty locale name selects the locale from the environment. */
  setlocale(LC_ALL, "");

  d = opendir(".");
  if (!d) {
    /* Report the failure instead of exiting silently with status 0. */
    perror("opendir");
    return 1;
  }

  while ((dir = readdir(d)) != NULL) {
    printf("%s\n", dir->d_name);
  }
  closedir(d);
  return 0;
}

编译运行后可以发现,文件名的输出顺序没有任何规律,看起来是随机的。网上有人如此解释:

    The entries are probably returned in whatever order the implementor figured would be the fastest order to return them.

    Traditionally Unix filesystems store a list files and directories in an unsorted list.  Think of it as an array.  The fastest way to return the items is to just loop over the array.

    A fast way to insert an item into the array is to insert in the next unused slot.  Suppose the filesystem does not keep an index that points to the next free slot so the system just loops over the array until it finds a free slot.

    A fast way to remove an item is to loop over the array to find the item and then just mark that slot as unused.  The system could sort the array when an item is removed but that takes time so it likely just leaves an open slot wherever an item is removed.

    There are filesystems (like ReiserFS) that use trees for indexing to give faster searching.  In the end the order that items are returned by readdir is not defined to be sorted in any particular way so it's up to the application to sort the items as required.


可以总结: 从系统获得的文件名并不是默认排好序的,而是 ls 程序在拿到文件名之后自己做了 sort 的动作。

ls.c 默认排序是名字 sort_name:

/* Which attribute of a file the listing is ordered by; selected with
   -t, -S, -U, -X or -v.  The numeric values matter because they are
   used as indices in the sort functions array (see sort_files()), so
   each one is spelled out explicitly.  */

enum sort_type
  {
    sort_none      = -1,  /* -U */
    sort_name      = 0,   /* default */
    sort_extension = 1,   /* -X */
    sort_size      = 2,   /* -S */
    sort_version   = 3,   /* -v */
    sort_time      = 4,   /* -t */
    sort_numtypes  = 5    /* the number of elements of this enum */
  };


ls.c 的 sort_files 会把 sort_functions 比较函数传入 mpsort 函数:

/* Sort the files now in the table.  */

static void
sort_files (void)
{
  bool use_strcmp;

  if (sorted_file_alloc < cwd_n_used + cwd_n_used / 2)
    {
      free (sorted_file);
      sorted_file = xnmalloc (cwd_n_used, 3 * sizeof *sorted_file);
      sorted_file_alloc = 3 * cwd_n_used;
    }

  initialize_ordering_vector ();

  if (sort_type == sort_none)
    return;

  /* Try strcoll.  If it fails, fall back on strcmp.  We can't safely
     ignore strcoll failures, as a failing strcoll might be a
     comparison function that is not a total order, and if we ignored
     the failure this might cause qsort to dump core.  */

  if (! setjmp (failed_strcoll))
    use_strcmp = false;      /* strcoll() succeeded */
  else
    {
      use_strcmp = true;
      assert (sort_type != sort_version);
      initialize_ordering_vector ();
    }

  /* When sort_type == sort_time, use time_type as subindex.  */
  mpsort ((void const **) sorted_file, cwd_n_used,
          sort_functions[sort_type + (sort_type == sort_time ? time_type : 0)]
                        [use_strcmp][sort_reverse]
                        [directories_first]);

sort_functions 的定义包括了 xstrcoll 和 strcmp 两大比较函数(以及 rev 版本), 用哪一个取决于上面的 use_strcmp :

/* Define the 8 different sort function variants required for each sortkey.
   KEY_NAME is a token describing the sort key, e.g., ctime, atime, size.
   KEY_CMP_FUNC is a function to compare records based on that key, e.g.,
   ctime_cmp, atime_cmp, size_cmp.  Append KEY_NAME to the string,
   '[rev_][x]str{cmp|coll}[_df]_', to create each function name.  */
#define DEFINE_SORT_FUNCTIONS(key_name, key_cmp_func)           \
  /* direct, non-dirfirst versions */                   \
  static int xstrcoll_##key_name (V a, V b)             \
  { return key_cmp_func (a, b, xstrcoll); }             \
  static int strcmp_##key_name (V a, V b)               \
  { return key_cmp_func (a, b, strcmp); }               \


再看 xstrcoll 是神马:

/* Compare A and B with strcoll, i.e. according to the current locale.
   strcoll has no error return value, so errno is cleared before the
   call and inspected afterwards.  If it was set, print a diagnostic,
   call set_exit_status (false), and longjmp to failed_strcoll instead
   of returning.  */

static jmp_buf failed_strcoll;

static int
xstrcoll (char const *a, char const *b)
{
  int order;

  errno = 0;
  order = strcoll (a, b);
  if (errno == 0)
    return order;

  /* strcoll reported an error: diagnose it and bail out non-locally.  */
  error (0, errno, _("cannot compare file names %s and %s"),
         quote_n (0, a), quote_n (1, b));
  set_exit_status (false);
  longjmp (failed_strcoll, 1);
}



已经非常清楚是调用了 strcoll 这个 C 标准库函数 ("coll" 顾名思义联想到 LC_COLLATE)。 而注释 Use strcoll to compare strings in this locale. 也解释了 strcoll 依赖于 locale。


口说无凭,让我们调用代码:

#include <stddef.h>
#include <locale.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
void test(char * a, char * b);

/*
 * Run test() on a fixed list of string pairs, in order, so the collation
 * result for each pair is printed.  Command-line arguments are ignored.
 */
int main(int argc, const char * argv[])
{
    /* Pairs to compare, listed in the order their results are printed. */
    static char *const pairs[][2] = {
        {"a", "c"},
        {"1", "3"},
        {"1", "11"},
        {"a", "A"},
        {"-", ">"},
        {"+", "a"},
        {"+", "{"},
    };
    size_t i;

    (void) argc;
    (void) argv;

    for (i = 0; i < sizeof pairs / sizeof pairs[0]; i++) {
        test(pairs[i][0], pairs[i][1]);
    }
    return 0;
}

/*
 * Compare strings a and b with strcoll() under the locale taken from the
 * environment, and print exactly one line saying whether a collates
 * before b, after b, or equal to it.
 */
void test(char * a, char * b) {
    int result;

    /* An empty locale name makes setlocale() take every category,
       LC_COLLATE included, from the environment variables. */
    setlocale (LC_ALL, "");

    /* strcoll() only reads its arguments, so compare them directly rather
       than copying into fixed-size buffers that long inputs could overflow. */
    result = strcoll(a, b);

    /* One chain, so the "equal" case does not also fall into "less than". */
    if (result == 0) {
        printf("Strings are the same\n");
    } else if (result > 0) {
        printf("%s is greater than the %s\n", a, b);
    } else {
        printf("%s is less than the %s\n", a, b);
    }
}

编译后输出结果, 证明确实是 strcoll 被 locale 影响:

xb@dnxb:/tmp$ ./a.out
a is less than the c
1 is less than the 3
1 is less than the 11
a is less than the A
- is greater than the >
+ is less than the a
+ is greater than the {
xb@dnxb:/tmp$ LC_COLLATE=C ./a.out
a is less than the c
1 is less than the 3
1 is less than the 11
a is greater than the A
- is less than the >
+ is less than the a
+ is less than the {
xb@dnxb:/tmp$


因此只需要更改 locale 环境变量,即可影响 ls 的输出:

xb@dnxb:/tmp/test/sub$ locale |grep -E 'LANG|COLL' #当前系统 locale
LANG=en_US.UTF-8
LANGUAGE=
LC_COLLATE="en_US.UTF-8"
xb@dnxb:/tmp/test/sub$ ls
`  ×  <  =  >  -  ,  ;  :  !  ?  ¿  "  «  ]  {  \  #  1  2  a  b  m
xb@dnxb:/tmp/test/sub$ LC_COLLATE=en_US.UTF-8 ls
`  ×  <  =  >  -  ,  ;  :  !  ?  ¿  "  «  ]  {  \  #  1  2  a  b  m
xb@dnxb:/tmp/test/sub$ LC_COLLATE=C ls
!  "  #  ,  -  1  2  :  ;  <  =  >  ?  \  ]  `  a  b  m  {  «  ¿  ×
xb@dnxb:/tmp/test/sub$ LC_COLLATE=C.UTF-8 ls
!  "  #  ,  -  1  2  :  ;  <  =  >  ?  \  ]  `  a  b  m  {  «  ¿  ×
xb@dnxb:/tmp/test/sub$


可以看出 LC_COLLATE=C 就是你想要的纯 ascii 代码排序:

xb@dnxb:/tmp/test/sub$ LC_COLLATE=C ls | hexdump -C
00000000  21 0a 22 0a 23 0a 2c 0a  2d 0a 31 0a 32 0a 3a 0a  |!.".#.,.-.1.2.:.|
00000010  3b 0a 3c 0a 3d 0a 3e 0a  3f 0a 5c 0a 5d 0a 60 0a  |;.<.=.>.?.\.].`.|
00000020  61 0a 62 0a 6d 0a 7b 0a  c2 ab 0a c2 bf 0a c3 97  |a.b.m.{.........|
00000030  0a                                                |.|
00000031
xb@dnxb:/tmp/test/sub$


至于 LC_COLLATE=C.UTF-8 的好处是支持中文输出, 而不是 ? 乱码。

继续深入:

xb@dnxb:/tmp$ localedef --help | tail -n 5
System's directory for character maps : /usr/share/i18n/charmaps
                       repertoire maps: /usr/share/i18n/repertoiremaps
                       locale path    : /usr/lib/locale:/usr/share/i18n
For bug reporting instructions, please see:
... 和谐链接
xb@dnxb:/tmp$ ls /usr/share/i18n
charmaps  locales  SUPPORTED
xb@dnxb:/tmp$ ls /usr/lib/locale
C.UTF-8  locale-archive
xb@dnxb:/tmp$


看到 locale 相关的目录 i18n, 搜索 LC_COLLATE 能发现 "iso14651_t1" 关键字 。

xb@dnxb:/tmp$ grep 'END LC_COLL' /usr/share/i18n/locales/en_US  -B 5
LC_COLLATE

% Copy the template from ISO/IEC 14651
copy "iso14651_t1"

END LC_COLLATE
xb@dnxb:/tmp$


随便网搜到 ISO 14651 的维基百科就知道要去看 iso 文档,我穷小子没钱买只能下载。

边看 iso 文档了解, 继续搜:

xb@dnxb:/tmp$ find  /usr/share/i18n/ -name '*iso14651*'
/usr/share/i18n/locales/iso14651_t1_pinyin
/usr/share/i18n/locales/iso14651_t1_common
/usr/share/i18n/locales/iso14651_t1
xb@dnxb:/tmp$


打开 iso14651_t1_common 文件搜索 } 就能看见梦寐以求的 table 啦:

 order_start <SPECIAL>;forward;backward;forward;forward,position
#
# Tout caractère non précisément défini sera considéré comme caractère spécial
# et considéré uniquement au dernier niveau.
#
# Any character not precisely specified will be considered as a special
# character and considered only at the last level.
# <U0000>......<U7FFFFFFF> IGNORE;IGNORE;IGNORE;<U0000>......<U7FFFFFFF>
#
# SYMB.                                N° GLY
#
<U0020> IGNORE;IGNORE;IGNORE;<U0020> # 32 <SP>
<U005F> IGNORE;IGNORE;IGNORE;<U005F> # 33 _
<U0332> IGNORE;IGNORE;IGNORE;<U0332> # 34 <"_>
<U00AF> IGNORE;IGNORE;IGNORE;<U00AF> # 35 - (MACRON)
<U00AD> IGNORE;IGNORE;IGNORE;<U00AD> # 36 <SHY>
<U002D> IGNORE;IGNORE;IGNORE;<U002D> # 37 -
<U002C> IGNORE;IGNORE;IGNORE;<U002C> # 38 ,
<U003B> IGNORE;IGNORE;IGNORE;<U003B> # 39 ;
<U003A> IGNORE;IGNORE;IGNORE;<U003A> # 40 :
<U0021> IGNORE;IGNORE;IGNORE;<U0021> # 41 !
<U00A1> IGNORE;IGNORE;IGNORE;<U00A1> # 42 ¡
<U003F> IGNORE;IGNORE;IGNORE;<U003F> # 43 ?
<U00BF> IGNORE;IGNORE;IGNORE;<U00BF> # 44 ¿
<U002F> IGNORE;IGNORE;IGNORE;<U002F> # 45 /
<U0338> IGNORE;IGNORE;IGNORE;<U0338> # 46 <"/>
<U002E> IGNORE;IGNORE;IGNORE;<U002E> # 47 .
<U00B7> IGNORE;IGNORE;IGNORE;<U00B7> # 58 ×
<U00B8> IGNORE;IGNORE;IGNORE;<U00B8> # 59 ¸
<U0328> IGNORE;IGNORE;IGNORE;<U0328> # 60 <";>
<U0027> IGNORE;IGNORE;IGNORE;<U0027> # 61 '
<U2018> IGNORE;IGNORE;IGNORE;<U2018> # 62 <'6>
<U2019> IGNORE;IGNORE;IGNORE;<U2019> # 63 <'9>
<U0022> IGNORE;IGNORE;IGNORE;<U0022> # 64 "
<U201C> IGNORE;IGNORE;IGNORE;<U201C> # 65 <"6>
<U201D> IGNORE;IGNORE;IGNORE;<U201D> # 66 <"9>
<U00AB> IGNORE;IGNORE;IGNORE;<U00AB> # 67 «
<U00BB> IGNORE;IGNORE;IGNORE;<U00BB> # 68 »
<U0028> IGNORE;IGNORE;IGNORE;<U0028> # 69 (
<U207D> IGNORE;IGNORE;IGNORE;<U207d> # 70 <(S>
<U0029> IGNORE;IGNORE;IGNORE;<U0029> # 71 )
<U207E> IGNORE;IGNORE;IGNORE;<U207E> # 72 <)S>
<U005B> IGNORE;IGNORE;IGNORE;<U005B> # 73 [
<U005D> IGNORE;IGNORE;IGNORE;<U005D> # 74 ]
<U007B> IGNORE;IGNORE;IGNORE;<U007B> # 75 {
<U007D> IGNORE;IGNORE;IGNORE;<U007D> # 76 }
<U00A7> IGNORE;IGNORE;IGNORE;<U00A7> # 77 §
<U00B6> IGNORE;IGNORE;IGNORE;<U00B6> # 78 ¶
<U00A9> IGNORE;IGNORE;IGNORE;<U00A9> # 79 ©
<U00AE> IGNORE;IGNORE;IGNORE;<U00AE> # 80 ®
<U2122> IGNORE;IGNORE;IGNORE;<U2122> # 81 <TM>
<U0040> IGNORE;IGNORE;IGNORE;<U0040> # 82 @
<U00A4> IGNORE;IGNORE;IGNORE;<U00A4> # 83 ¤
<U00A2> IGNORE;IGNORE;IGNORE;<U00A2> # 84 ¢
<U0024> IGNORE;IGNORE;IGNORE;<U0024> # 85 $
<U00A3> IGNORE;IGNORE;IGNORE;<U00A3> # 86 £
<U00A5> IGNORE;IGNORE;IGNORE;<U00A5> # 87 ¥
<U20A0> IGNORE;IGNORE;IGNORE;<U20A0> # ecu
... 省略
<U20AF> IGNORE;IGNORE;IGNORE;<U20AF> # drachma
<U002A> IGNORE;IGNORE;IGNORE;<U002A> # 88 *
<U005C> IGNORE;IGNORE;IGNORE;<U005C> # 89
<U0026> IGNORE;IGNORE;IGNORE;<U0026> # 90 &
<U0023> IGNORE;IGNORE;IGNORE;<U0023> # 91 #
<U0025> IGNORE;IGNORE;IGNORE;<U0025> # 92 %
<U207B> IGNORE;IGNORE;IGNORE;<U207D> # 93 <-S>
<U002B> IGNORE;IGNORE;IGNORE;<U002B> # 94 +
<U207A> IGNORE;IGNORE;IGNORE;<U207E> # 95 <+S>
<U00B1> IGNORE;IGNORE;IGNORE;<U00B1> # 96 ±
<U00B4> IGNORE;IGNORE;IGNORE;<0> # 123 ´
<U0060> IGNORE;IGNORE;IGNORE;<1> # 124 `
<U0306> IGNORE;IGNORE;IGNORE;<2> # 125 <"(>
<U005E> IGNORE;IGNORE;IGNORE;<3> # 126 ^
<U030C> IGNORE;IGNORE;IGNORE;<4> # 127 <"<>
<U030A> IGNORE;IGNORE;IGNORE;<5> # 128 <"0>
<U00A8> IGNORE;IGNORE;IGNORE;<6> # 129 ¨
<U030B> IGNORE;IGNORE;IGNORE;<7> # 130 <"">
<U007E> IGNORE;IGNORE;IGNORE;<8> # 131 ~
<U0307> IGNORE;IGNORE;IGNORE;<9> # 132 <".>
<U00F7> IGNORE;IGNORE;IGNORE;<a> # 133 ¸
<U00D7> IGNORE;IGNORE;IGNORE;<b> # 134 ´
<U2260> IGNORE;IGNORE;IGNORE;<c> # 135 <!=>
<U003C> IGNORE;IGNORE;IGNORE;<d> # 136 <
<U2264> IGNORE;IGNORE;IGNORE;<e> # 137 <=<>
<U003D> IGNORE;IGNORE;IGNORE;<f> # 138 =
<U2265> IGNORE;IGNORE;IGNORE;<g> # 139 </>=>
<U003E> IGNORE;IGNORE;IGNORE;<h> # 140 >
<U00AC> IGNORE;IGNORE;IGNORE;<i> # 141 ¬
<U007C> IGNORE;IGNORE;IGNORE;<j> # 142 |
<U00A6> IGNORE;IGNORE;IGNORE;<k> # 143 |
<U00B0> IGNORE;IGNORE;IGNORE;<l> # 144 °
<U00B5> IGNORE;IGNORE;IGNORE;<m> # 145 m
<U2126> IGNORE;IGNORE;IGNORE;<n> # 146 <Om>
... 省略


回顾:

xb@dnxb:/tmp/test/sub$ LC_COLLATE=en_US.UTF-8 ls
`  ×  <  =  >  -  ,  ;  :  !  ?  ¿  "  «  ]  {  \  #  1  2  a  b  m
xb@dnxb:/tmp/test/sub$


你可以发现它第一组是 <U0020> IGNORE;IGNORE;IGNORE;<U0020> # 32 <SP> 至 <U00B4> IGNORE;IGNORE;IGNORE;<0> # 123 ´ , 之后是第二组。第二组轻于第一组。第二组 ` × < = > 除了奇怪的 × 外,其它都是跟 en_US.UTF-8 的 ls 排序吻合 (我猜是 weight 越少越轻,所以往上排序)。之后第一组的 - , ; : ! ? ¿ " « ] { \ # ( `\` 是 005c,# 看不见),全和 en_US.UTF-8 的 ls 排序吻合。

文档说的 forward;backward 有点生涩, 所以我也没完全理解排序原理。可以在 Stack Overflow 上搜其它的解释, 自己理解:
Take a look at how a and A are ordered based on their entries in iso14651_t1_common:
<U0061> <a>;<BAS>;<MIN>;IGNORE # 198 a
<U0041> <a>;<BAS>;<CAP>;IGNORE # 517 A

b and B are similar:
<U0062> <b>;<BAS>;<MIN>;IGNORE # 233 b
<U0042> <b>;<BAS>;<CAP>;IGNORE # 550 B

We see that on the first pass, both a and A have the collating symbol <a>, while both b and B have the collating symbol <b>. Since <a> appears before <b> in iso14651_t1_common, a and A are tied before b and B. The second pass doesn't break the ties because all four characters have the collating symbol <BAS>, but during the third pass the ties are resolved because the collating symbol for lowercase letters <MIN> appears on line 3467, before the collating symbol for uppercase letters <CAP> (line 3488). So the sort order ends up as a, A, b, B.
Swapping the first and third collating symbols would sort letters first by case (lower then upper), then by accent (<BAS> means non-accented), then by alphabetical order. However, both <MIN> and <CAP> come before the numeric digits, so this would have the unwanted effect of putting digits after letters.

The easiest way to keep digits first while making all lowercase letters come before all uppercase letters is to force all letters to tie during the first comparison by setting them all equal to <a>. To make sure that they sort alphabetically within case, change the last collating symbol from IGNORE to the current first collating symbol. Following this pattern, a would become:
<U0061> <a>;<BAS>;<MIN>;<a> # 198 a

A would become:
<U0041> <a>;<BAS>;<CAP>;<a> # 517 A

b would become:
<U0062> <a>;<BAS>;<MIN>;<b> # 233 b

B would become:
<U0042> <a>;<BAS>;<CAP>;<b> # 550 B
and so on for the rest of the letters.


文档有解释 weight  不过也是生涩 ,  ibm 有更适合我这种小白理解的介绍:

    Each single-byte character in a database is represented internally as a unique number between 0 and 255 (in hexadecimal  notation, between X'00' and X'FF'). This number is referred to as the code point of the character; the assignment of numbers to  characters in a set is collectively called a code page. A collating sequence is a mapping between the code point and the desired position of each character in a sorted sequence.  The numeric value of the position is called the weight of the character in the collating  sequence.  In the simplest collating sequence, the weights are identical to the code points.  This is called the identity sequence.

    For example, suppose the characters B and b have the code points X'42' and X'62',
    respectively.  If (according to the collating sequence table) they both have a sort weight of X'42' (B), they collate the same.  If the sort weight for B is X'9E', and the sort weight for b is X'9D', b will be sorted before B.  The collating sequence table specifies the weight of each character.  The table is different from a code page, which specifies the code point of each character. Consider the following example.  The ASCII characters A through Z are represented by X'41' through X'5A'.  To describe a collating sequence in which these characters are sorted consecutively (no intervening characters), you can write: X'41', X'42',
    … X'59', X'5A'.

    The hexadecimal value of a multibyte character is also used as the weight. For example, suppose the code points for the double-byte characters A and B are X'8260' and X'8261'  respectively, then the collation weights for X'82', X'60', and X'61' are used to sort these two  characters according to their code points. The weights in a collating sequence need not be unique.  For example, you could give uppercase letters and their lowercase equivalents the
    same weight.


至于为什么排序要 locale, iso 文档提出了一些问题例子, 如文化/口音的不同,排序也要求不同:

...
 Sorted  Internal
       List    Values
       Aaaa    01010101
       abbb    01030303
       Aaaa    02010101
       Abbb    02030303
This is also predictable, but remains obviously incorrect for any country with regard to cultural

最后,让我们自制 locales。甭管 C 或 en_US.UTF-8, {} 都是重过 [], 现在我们把它们俩颠倒:

xb@dnxb:~$ mkdir ~/.xiaobai_locale
xb@dnxb:~$ cd ~/.xiaobai_locale
xb@dnxb:~/.xiaobai_locale$ cp /usr/share/i18n/locales/en_US  ~/.xiaobai_locale/
xb@dnxb:~/.xiaobai_locale$ cp /usr/share/i18n/locales/iso14651_t1*  ~/.xiaobai_locale/
xb@dnxb:~/.xiaobai_locale$ grep  '# 73 \[' iso14651_t1_common -A 4
<U005B> IGNORE;IGNORE;IGNORE;<U005B> # 73 [
<U005D> IGNORE;IGNORE;IGNORE;<U005D> # 74 ]
<U007B> IGNORE;IGNORE;IGNORE;<U007B> # 75 {
<U007D> IGNORE;IGNORE;IGNORE;<U007D> # 76 }
<U00A7> IGNORE;IGNORE;IGNORE;<U00A7> # 77 §
xb@dnxb:~/.xiaobai_locale$ touch '[' ']' '{' '}'
xb@dnxb:~/.xiaobai_locale$ ls
[  ]  {  }  en_US  iso14651_t1  iso14651_t1_common  iso14651_t1_pinyin
xb@dnxb:~/.xiaobai_locale$ vimx iso14651_t1_common #把 { 和 } 拉上
xb@dnxb:~/.xiaobai_locale$ grep  '# 75 {' iso14651_t1_common -A 4
<U007B> IGNORE;IGNORE;IGNORE;<U007B> # 75 {
<U007D> IGNORE;IGNORE;IGNORE;<U007D> # 76 }
<U005B> IGNORE;IGNORE;IGNORE;<U005B> # 73 [
<U005D> IGNORE;IGNORE;IGNORE;<U005D> # 74 ]
<U00A7> IGNORE;IGNORE;IGNORE;<U00A7> # 77 §
xb@dnxb:~/.xiaobai_locale$ localedef -i en_US -f UTF-8 -vc $HOME/.xiaobai_locale/en_HELLO.UTF-8
en_US:15: non-symbolic character value should not be used
... 省略
LC_CTYPE: table for width: 0 bytes
xb@dnxb:~/.xiaobai_locale$ ls
[  ]  {  }  en_HELLO.UTF-8  en_US  iso14651_t1  iso14651_t1_common  iso14651_t1_pinyin
xb@dnxb:~/.xiaobai_locale$ LOCPATH=$HOME/.xiaobai_locale LC_ALL=en_HELLO.UTF-8 ls
{  }  [  ]  en_HELLO.UTF-8  en_US  iso14651_t1  iso14651_t1_common  iso14651_t1_pinyin
xb@dnxb:~/.xiaobai_locale$

{} 变成 [] 的左边,实验成功 :)


再来把 '黄' 和 '晃' 颠倒。由于第一个 pass 的 '黄' <U9EC4> 重过 '晃' <U6643>, 所以把 '晃' 放在 '黄' 上面是无效的。可以把最后一个 pass 的 IGNORE 换成 <a> 和 <b>, 因为 <a> 轻过 <b>, 所以把 '黄' 改成 <a> 和 '晃' 改成 <b>,即可让 '晃' 重过 '黄', 不过'晃' 必须在 '黄' 下面 。至于 include template 有两种方法,一个是 copy, 另外一个是 script。由于 /usr/share/i18n/locales/iso14651_t1_pinyin 已有 HAN, 所以直接用 copy "iso5201314_pinyin" 以避免 HAN 命名冲突。我也跳过 iso14651_t1 中间代理, 直接用 en_HELLO include "iso5201314" 和 "iso5201314_pinyin":


xb@dnxb:~/.xiaobai_locale$ cp en_US en_HELLO

xb@dnxb:~/.xiaobai_locale$ grep iso en_HELLO -A 2 -B 1

% Copy the template from ISO/IEC 14651

copy "iso14651_t1"


END LC_COLLATE

xb@dnxb:~/.xiaobai_locale$ vimx en_HELLO

xb@dnxb:~/.xiaobai_locale$ grep iso en_HELLO -A 2 -B 1

% Copy the template from ISO/IEC 5201314

copy "iso5201314"

copy "iso5201314_pinyin"


END LC_COLLATE

xb@dnxb:~/.xiaobai_locale$ sed 's/14651/5201314/g' iso14651_t1_common > iso5201314

xb@dnxb:~/.xiaobai_locale$ sed 's/copy "iso14651_t1_common"//g' iso14651_t1_pinyin > iso5201314_pinyin

xb@dnxb:~/.xiaobai_locale$ touch '[' ']' '黄' '晃' '{' '}'

xb@dnxb:~/.xiaobai_locale$ ls '[' ']' '黄' '晃' '{' '}'

[  ]  {  }  晃  黄

xb@dnxb:~/.xiaobai_locale$ grep -En '晃|黄' iso5201314_pinyin

7523:<U9EC4> <U9EC4>;IGNORE;IGNORE;IGNORE       #黄20546

7535:<U6643> <U6643>;IGNORE;IGNORE;IGNORE       #晃2551

xb@dnxb:~/.xiaobai_locale$ vimx iso5201314_pinyin

xb@dnxb:~/.xiaobai_locale$ grep -En '晃|黄' iso5201314_pinyin

7523:<U9EC4> <U9EC4>;IGNORE;IGNORE;<a>  #黄20546

7535:<U6643> <U6643>;IGNORE;IGNORE;<b>  #晃2551

xb@dnxb:~/.xiaobai_locale$ sudo cp en_HELLO /usr/share/i18n/locales/

xb@dnxb:~/.xiaobai_locale$ sudo cp iso5201314* /usr/share/i18n/locales/

xb@dnxb:~/.xiaobai_locale$ sudo bash -c "echo 'en_HELLO.UTF-8 UTF-8' >> /etc/locale.gen"

xb@dnxb:~/.xiaobai_locale$ sudo locale-gen #更新 /usr/lib/locale/locale-archive

Generating locales (this might take a while)...

... 省略

  en_HELLO.UTF-8... done

Generation complete.

xb@dnxb:~/.xiaobai_locale$ sudo update-locale LANG=en_HELLO.UTF-8 #更改 locale 全局变量

xb@dnxb:~/.xiaobai_locale$ grep LANG /etc/default/locale

LANG=en_HELLO.UTF-8

xb@dnxb:~/.xiaobai_locale$ . /etc/default/locale #不用重启,现在测试

xb@dnxb:~/.xiaobai_locale$ ls '[' ']' '黄' '晃' '{' '}' #全部成功颠倒

{  }  [  ]  黄  晃

xb@dnxb:~/.xiaobai_locale$


最后的最后,把 "我爱你" 排序在所有字的最下方, 如:

<U8444> <U8444>;IGNORE;IGNORE;IGNORE    #葄0
<U888F> <U888F>;IGNORE;IGNORE;IGNORE    #袏0
<U963C> <U963C>;IGNORE;IGNORE;IGNORE    #阼0
<U3010> <U3010>;IGNORE;IGNORE;<d>   #【
<U6211> <U6211>;IGNORE;IGNORE;<a>   #我841127
<U7231> <U7231>;IGNORE;IGNORE;<b>   #爱60751
<U4F60> <U4F60>;IGNORE;IGNORE;<c>   #你313574
#
order_end
#
END LC_COLLATE


浪漫叻~


No comments:

Post a Comment