第121回プログラミングについて『作っておくと便利なちょっとしたプログラムその5』

　　　　　　　　　　プログラミングについて　第１２１回目

　　　　　　　『作っておくと便利なちょっとしたプログラムその５』

　今回も前回に続いてプログラムを作っていきましょう。
前回までのプログラムリストは長くなってきましたので、最後に完成したリストを掲載することにします。

　１５０万個のデータの処理が３０秒（ファイルからの読み込み時間も含みます）もかかるということについて、何が問題なのかを考えてみてもあまり気に掛かる部分はないのですが、あるとすれば２分木がどんなにバランスが良くても根から末端までの深さが１５程度になってしまうことです。ですのでデータの個数自体が１５０万個ですので文字列の比較回数も、２分木の整形の手間も馬鹿にならなくなるためと考えられます。ということは、２分木の深さが浅くなればこの問題は解消される可能性がるということです。
　どういう風に実現するかというと、たくさんの小さい２分木を作って最後に２分木を１つにまとめれば最終的に元と同じ２分木を作り出すという手法をとります。たくさんの２分木に分けるには以前お話ししたハッシュ法を利用します。２分木へのポインタの配列を作っておいて、文字列によってどの配列が示す２分木に追加するのかを分類するようにします。

まずユニークデータの構造体を次のように配列に変更します。

　#define MAX_UNIQUE_DATA　1000

　struct unique_data_strct{
　　char *text;
　　int　total;
　　struct unique_data_strct *left,*right;
　　char ldepth,rdepth;
　};
　struct unique_data_strct *unique_data[MAX_UNIQUE_DATA];

配列の個数は任意なのですが、あまり少ないと効果は大きくなりませんのでここでは１０００個にすることにします。逆に大きすぎても効果はあまり変わらなくなるので配列の個数は何度か実験してみて決定するようにします。

この配列は初期化されていませんので、関数 process_uline の最初に、

　initialize_unique();

を追加し、関数 initialize_unique を次のようにして各配列の値をヌルにします。

　initialize_unique()
　{
　　int i;

　　for(i=0;i<MAX_UNIQUE_DATA;i++)unique_data[i]=NULL;

　　return 1;
　}

関数 process_uline で関数 add_unique を呼び出してテキストを２分木に追加する部分は、これまでは、

　add_unique(text,unique_data);

でしたが、極力他の関数の変更部分を少なくするため、

　if(unique_data[index])add_unique(text,unique_data[index]);
　else　　　　　　　　　unique_data[index]=new_unique_data(text);

と変更します。配列の要素番号 index は int 型で定義しておきます。この index の値を固定（変化させない）すれば、これまでのプログラムと同じ２分木ができあがることになります。文字列によって変化させれば、小さい２分木がたくさんできることになります。

　この index の決め方が問題になります。上の部分の前の行に、

　index=get_unique_hash(text);

という１行を追加し、関数 get_unique_hash を次のようにしてみました。

　get_unique_hash(text)
　char *text;
　{
　　unsigned char *s;
　　unsigned int　hash;

　　for(s=(unsigned char *)text,hash=0;*s;s++)hash=hash+(*s);

　　return hash % MAX_UNIQUE_DATA;
　}

文字列の各文字の値を加算し、配列数で割った余りを配列の要素番号にするという計算です。unsigned char にしているのは２バイト文字のときには最上位ビットがオンになりますので、char 型のときには負の値になってしまうので常に正にするためです。
これで実験してみると思ったより値が分散してくれませんでした。また、

　abcde
　abced
　edcba

など、順番が違っていても同じ文字で構成されている文字列は同じ値になってしまうのでもう少し値が分散するように次のようにしてみました。

　get_unique_hash(text)
　char *text;
　{
　　unsigned char *s;
　　unsigned int　hash,i;

　　for(s=(unsigned char *)text,i=1,hash=0;*s;s++,i++)hash=hash+(*s)*i;

　　return hash % MAX_UNIQUE_DATA;
　}

これは各文字の値に何番目の文字かを乗算してその合計を求めて配列の要素数の剰余を配列の要素番号にするようにしたもので、結構値が分散したのでこれにしました。本当に文字列全体の値を加算する必要があるのかも疑問ですし、また他にももっと良い計算方法はあると思いますので工夫してみて下さい。

小さい２分木が作成された後、これらの２分木を１つにまとめなければなりませんので、関数 process_uline の display_unique の呼び出しの前にこの処理を行ないます。

　for(is=1;is<MAX_UNIQUE_DATA;is++)bind_unique(unique_data[is]);

　display_unique(unique_data[0]);

関数 bind_unique は指定された２分木を配列要素番号 0 の２分木に合成する動作をさせることにしますので、この関数に指定する２分木は配列要素番号を 1 からにします。
このループが終了すると、0 番目に全体の２分木ができますので、関数 display_uniqueには 0 番目の２分木のみを表示させます。

関数 bind_unique は次のようになります。

　bind_unique(udc)
　struct unique_data_strct *udc;
　{
　　if(!udc)return 1;

　　bind_unique(udc->left );
　　bind_unique(udc->right);

　　if(unique_data[0])add_unique(udc->text,unique_data[0]);
　　else　　　　　　　unique_data[0]=new_unique_data(udc->text);

　　DELETE(udc->text);
　　DELETE(udc　　　);

　　return 1;
　}

プログラムを簡単にするため、関数 add_unique と関数 new_unique_data はそのまま利用することにしました。これで完成と思ってプログラムを実行してみたら、個数が殆ど１になってしまいました。考えてみればこれらの関数は個数１つしか加算しないことを忘れていました。そこでこれらの関数を、

　add_unique(total,text,udc)
　int　total;
　char *text;
　struct unique_data_strct *udc;
　{
　　　 :
　　　 :

　new_unique_data(total,text)
　int　total;
　char *text;
　{
　　　 :
　　　 :

に変更し、任意の個数を初期値と加算するようにし、プログラム全体を直しました。これで完成です。実際に実験してみると、それまで３０秒もかかった時間が９秒に短縮され大幅な改善になりました。

最後にプログラムの全リストです。これまで説明してきた機能の他に、-? オプションで簡単なヘルプの表示を行なう機能と、各オプションはハイフォンでもスラッシュでも指定できるようにしてあります（ヘルプはハイフォンのみになっています）。ユニーク行の表示のときの個数の桁数を最低４桁としそれよりも大きいときには桁数を変化させるようにしています。またリストは私のリストそのままとしありますので簡単なコメントも残っています。タブは４タブのスペースに変換してあります。関数の順番はバラバラです。

#define VERSION_NUMBER "v1.0"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>

#define NEW(type,element)　 (type *)malloc(sizeof(type)*(element))
#define DELETE(addr)　　　　free(addr)

#define READ_BUF_SIZE　　　 8192

#define PROC_MODE_ULINE　　 1
#define PROC_MODE_RLINE　　 2
#define PROC_MODE_HELP　　　3

int proc_mode=PROC_MODE_ULINE;
int disp_total=0;
int enclose　 =0;

char *fname=NULL;
FILE *fp=stdin;

#define MAX_UNIQUE_DATA　　 1000

struct unique_data_strct{
　　char *text;
　　int　total;
　　struct unique_data_strct *left,*right;
　　char ldepth,rdepth;
};
struct unique_data_strct *unique_data[MAX_UNIQUE_DATA];

#define INITIAL_DISP_TOTAL_LENGTH　 4
char disp_total_form[11]="%4d ";
int　max_total=0;

char *new_strcat(char *string1,char *string2);
struct unique_data_strct *new_unique_data(int total,char *text);

/*==================================== main ===================================
* Main routine.
*/
void main(argc,argv)
int　 argc;
char *argv[];
{
/*
*　 " Analize argment "
*/
　　if(analize_argment(argc,argv)== -1)goto end;
/*
*　 " Process "
*/
　　if(proc_mode==PROC_MODE_HELP){
　　　　process_help();
　　}
　　else{
　　　　if(open_source()== -1)goto end;

　　　　　　 if(proc_mode==PROC_MODE_ULINE)process_uline();
　　　　else if(proc_mode==PROC_MODE_RLINE)process_rline();

　　　　close_source();
　　}
/*
*　 " End of program "
*/
end:
　　exit(0);
}
/*============================== analize_argment ==============================
* Analize argment.
*/
analize_argment(argc,argv)
int　 argc;
char *argv[];
{
　　int i;

　　for(i=1;i<argc;i++){
　　　　　　 if(stricmp(argv[i],"-u")==0
　　　　　　 || stricmp(argv[i],"/u")==0)proc_mode=PROC_MODE_ULINE;
　　　　else if(stricmp(argv[i],"-r")==0
　　　　　　 || stricmp(argv[i],"/r")==0)proc_mode=PROC_MODE_RLINE;
　　　　else if(stricmp(argv[i],"-n")==0
　　　　　　 || stricmp(argv[i],"/n")==0)disp_total=1;
　　　　else if(stricmp(argv[i],"-c")==0
　　　　　　 || stricmp(argv[i],"/c")==0)enclose　 =1;
　　　　else if(stricmp(argv[i],"-?")==0
　　　　　　 || stricmp(argv[i],"/?")==0){
　　　　　　proc_mode=PROC_MODE_HELP;
　　　　　　return 1;
　　　　}
　　　　else{
　　　　　　for(;i<argc;i++){
　　　　　　　　if(fname && fname[0])fname=new_strcat(fname," ");
　　　　　　　　fname=new_strcat(fname,argv[i]);
　　　　　　}
　　　　}
　　}

　　return 1;
}
/*================================= new_strcat ================================
* strcat.
*/
char *new_strcat(char *string1,char *string2)
{
　　if(string1==NULL){
　　　　string1=NEW(char,strlen(string2)+1);
　　　　strcpy(string1,string2);
　　}
　　else{
　　　　int l;

　　　　l=strlen(string1)+strlen(string2)+1;

　　　　if(l>_msize(string1))string1=realloc(string1,l);

　　　　strcat(string1,string2);
　　}

　　return string1;
}
/*================================ open_source ================================
* Open source.
*/
open_source()
{
　　if(fname){
　　　　fp=fopen(fname,"r");
　　　　if(fp==NULL){
　　　　　　printf("入力ファイル\n");
　　　　　　printf("File : %s\n",fname);
　　　　　　printf("のオープン時にエラーが発生しました。\n");
　　　　　　printf("%s\n",strerror(errno));
　　　　　　return -1;
　　　　}
　　}

　　return 1;
}
/*================================ close_source ===============================
* Close source.
*/
close_source()
{
　　if(fname)fclose(fp);

　　return 1;
}
/*=============================== process_rline ===============================
* Process reject continued same text line.
*/
process_rline()
{
　　char　data[READ_BUF_SIZE+1];
　　int　ldata;

　　char *prev_text;
　　char * cur_text;
　　int　is,ie,total;

　　prev_text=new_strcat(NULL,"");
　　 cur_text=NULL;

　　total=0;
　　while(ldata=fread(data,1,READ_BUF_SIZE,fp)){
　　　　data[ldata]='\n';

　　　　for(is=ie=0;ie<=ldata;ie++){
　　　　　　if(data[ie]!='\n')continue;

　　　　　　data[ie]='\0';
　　　　　　cur_text=new_strcat(cur_text,data+is);
　　　　　　is=ie+1;

　　　　　　if(ie==ldata)break;

　　　　　　if(strcmp(prev_text,cur_text)!=0){
　　　　　　　　if(total)display_text(total,prev_text);

　　　　　　　　prev_text[0]='\0';
　　　　　　　　prev_text=new_strcat(prev_text,cur_text);

　　　　　　　　total=0;
　　　　　　}

　　　　　　total++;

　　　　　　cur_text[0]='\0';
　　　　}
　　}

　　if(total)display_text(total,prev_text);

　　DELETE(prev_text);
　　DELETE( cur_text);

　　return 1;
}
/*=============================== display_unique ==============================
* display unique.
*/
display_unique(udc)
struct unique_data_strct *udc;
{
　　if(udc->left )display_unique(udc->left );
　　display_text(udc->total,udc->text);
　　if(udc->right)display_unique(udc->right);

　　DELETE(udc->text);
　　DELETE(udc　　　);

　　return 1;
}
/*============================== new_unique_data ==============================
* Make new unique data.
*/
struct unique_data_strct *new_unique_data(total,text)
char *text;
{
　　struct unique_data_strct *udc;

　　udc=NEW(struct unique_data_strct,1);
　　udc->text=NEW(char,strlen(text)+1);
　　strcpy(udc->text,text);
　　udc->total=total;
　　udc->left =NULL;
　　udc->right=NULL;
　　udc->ldepth=0;
　　udc->rdepth=0;

　　return udc;
}
/*================================= add_unique ================================
* Add unique.
*/
add_unique(total,text,udc)
int total;
char *text;
struct unique_data_strct *udc;
{
　　int sts;

　　sts=strcmp(text,udc->text);
　　if(sts==0){
　　　　udc->total+=total;
　　　　if(udc->total>max_total)max_total=udc->total;
　　　　return 1;
　　}
　　else if(sts<0){
　　　　if(udc->left)add_unique(total,text,udc->left);
　　　　else　　　　 udc->left=new_unique_data(total,text);
　　}
　　else{
　　　　if(udc->right)add_unique(total,text,udc->right);
　　　　else　　　　　udc->right=new_unique_data(total,text);
　　}

　　set_unique_depth(udc);
　　rehash_unique(udc);

　　return 1;
}
/*============================== set_unique_depth =============================
* Set depth of unique data.
*/
set_unique_depth(udc)
struct unique_data_strct *udc;
{
　　if(udc->left){
　　　　udc->ldepth=(udc->left->ldepth > udc->left->rdepth ?
　　　　　　　　　　 udc->left->ldepth : udc->left->rdepth )+1;
　　}
　　else{
　　　　udc->ldepth=0;
　　}

　　if(udc->right){
　　　　udc->rdepth=(udc->right->ldepth > udc->right->rdepth ?
　　　　　　　　　　 udc->right->ldepth : udc->right->rdepth )+1;
　　}
　　else{
　　　　udc->rdepth=0;
　　}

　　return 1;
}
/*=============================== rehash_unique ===============================
* Rehash unique data.
*/
rehash_unique(udc)
struct unique_data_strct *udc;
{
　　struct unique_data_strct ut,*ul,*ur,*ulr,*url;
　　int diff;

　　diff=udc->ldepth - udc->rdepth;
　　if(diff==2){
　　　　ul=udc->left;

　　　　diff=ul->ldepth - ul->rdepth;
　　　　if(diff==1){
　　　　　　ut　= *udc;
　　　　　　*udc= *ul ;
　　　　　　*ul =　ut ;

　　　　　　ul->left　=udc->right;
　　　　　　udc->right=ul;
　　　　}
　　　　else{
　　　　　　ulr=ul->right;

　　　　　　ut　= *udc;
　　　　　　*udc= *ulr;
　　　　　　*ulr=　ut ;

　　　　　　ulr->left =udc->right;
　　　　　　udc->right=ulr;
　　　　　　ul->right =udc->left;
　　　　　　udc->left =ul;

　　　　　　set_unique_depth(ulr);
　　　　}

　　　　set_unique_depth(ul);
　　}
　　else if(diff== -2){
　　　　ur=udc->right;

　　　　diff=ur->ldepth - ur->rdepth;
　　　　if(diff== -1){
　　　　　　ut　= *udc;
　　　　　　*udc= *ur ;
　　　　　　*ur =　ut ;

　　　　　　ur->right=udc->left;
　　　　　　udc->left=ur;
　　　　}
　　　　else{
　　　　　　url=ur->left;

　　　　　　ut　= *udc;
　　　　　　*udc= *url;
　　　　　　*url=　ut ;

　　　　　　url->right=udc->left;
　　　　　　udc->left =url;
　　　　　　ur->left　=udc->right;
　　　　　　udc->right=ur;

　　　　　　set_unique_depth(url);
　　　　}

　　　　set_unique_depth(ur);
　　}
　　else{
　　　　return 1;
　　}

　　set_unique_depth(udc);

　　return 1;
}
/*============================= initialize_unique =============================
* Initialize unique data.
*/
initialize_unique()
{
　　int i;

　　for(i=0;i<MAX_UNIQUE_DATA;i++)unique_data[i]=NULL;

　　return 1;
}
/*============================== get_unique_hash ==============================
* Get (Put) hash value for unique data.
*/
get_unique_hash(text)
char *text;
{
　　unsigned char *s;
　　unsigned int　hash,i;

　　for(s=(unsigned char *)text,i=1,hash=0;*s;s++,i++)hash=hash+(*s)*i;

　　return hash % MAX_UNIQUE_DATA;
}
/*================================ bind_unique ================================
* Bind unique data.
*/
bind_unique(udc)
struct unique_data_strct *udc;
{
　　if(!udc)return 1;

　　bind_unique(udc->left );
　　bind_unique(udc->right);

　　if(unique_data[0])add_unique(udc->total,udc->text,unique_data[0]);
　　else　　　　　　　unique_data[0]=new_unique_data(udc->total,udc->text);

　　DELETE(udc->text);
　　DELETE(udc　　　);

　　return 1;
}
/*=============================== process_uline ===============================
* Process display unique line.
*/
process_uline()
{
　　char　data[READ_BUF_SIZE+1];
　　int　ldata;

　　char *text;
　　int　is,ie,index;

　　initialize_unique();

　　text=NULL;

　　while(ldata=fread(data,1,READ_BUF_SIZE,fp)){
　　　　data[ldata]='\n';

　　　　for(is=ie=0;ie<=ldata;ie++){
　　　　　　if(data[ie]!='\n')continue;

　　　　　　data[ie]='\0';
　　　　　　text=new_strcat(text,data+is);
　　　　　　is=ie+1;

　　　　　　if(ie==ldata)break;

　　　　　　index=get_unique_hash(text);

　　　　　　if(unique_data[index])add_unique(1,text,unique_data[index]);
　　　　　　else　　　　　　　　　unique_data[index]=new_unique_data(1,text);

　　　　　　text[0]='\0';
　　　　}
　　}

　　DELETE(text);

　　for(is=1;is<MAX_UNIQUE_DATA;is++)bind_unique(unique_data[is]);

　　set_display_total_form();

　　display_unique(unique_data[0]);

　　return 1;
}
/*================================ display_text ===============================
* Display text.
*/
display_text(total,text)
int　total;
char *text;
{
　　if(disp_total)printf(disp_total_form,total);

　　if(enclose)printf("(%s)\n",text);
　　else　　　 printf("%s\n"　,text);

　　return 1;
}
/*=========================== set_display_total_form ==========================
* Set dislay total form.
*/
set_display_total_form()
{
　　int i,l;

　　for(l=0,i=max_total;i>0;l++,i/=10);

　　if(l>INITIAL_DISP_TOTAL_LENGTH)sprintf(disp_total_form,"%%%dd ",l);

　　return 1;
}
/*================================ process_help ===============================
* Process help.
*/
process_help()
{
#define p printf

p("uline - ユニーク行表示プログラム %s\n",VERSION_NUMBER);
p("\n");
p("使用方法：\n");
p("\n");
p("　uline [-u|r|?] [-c] [-n] [ソーステキストファイル名]\n");
p("\n");
p("　-u　ユニーク行を表示します。\n");
p("\n");
p("　-r　連続する同一文字列行を削除して表示します。\n");
p("\n");
p("　-?　このヘルプを表示します。\n");
p("　　　本オプションを指定したときには他のすべてのオプションは無視されます。\n");
p("\n");
p("　-c　表示する文字列をカッコ () で囲みます。\n");
p("\n");
p("　-n　各ユニーク行の個数（-u）、または連続している同一文字列行の個数（-r）を表\n");
p("　　　示します。\n");
p("\n");
p("　ソーステキストファイル名\n");
p("　　　処理するソーステキストファイルの名称。本パラメータを省略すると標準入力か\n");
p("　　　らの読み込みになります。\n");
p("\n");
p("　∴ -u、-r、-? の指定がないときには -u として動作します。\n");

#undef p
}

それではまた次回。

Tags

Archive

第121回プログラミングについて『作っておくと便利なちょっとしたプログラムその5』

最近の投稿

Categories

Archive

Tags

そのお悩み、
アポロ技研に話してみませんか？

第121回 プログラミングについて『作っておくと便利なちょっとしたプログラム その5』

最近の投稿

Categories

Archive

Tags

そのお悩み、アポロ技研に話してみませんか？

第121回プログラミングについて『作っておくと便利なちょっとしたプログラムその5』

そのお悩み、
アポロ技研に話してみませんか？