第120回プログラミングについて『作っておくと便利なちょっとしたプログラムその4』

　　　　　　　　　　プログラミングについて　第１２０回目

　　　　　　　『作っておくと便利なちょっとしたプログラムその４』

　今回も前回に続いてプログラムを作っていきましょう。
まずは前回までのプログラムです。

　#include <stdio.h>
　#include <stdlib.h>
　#include <string.h>
　#include <malloc.h>

　#define NEW(type,element) (type *)malloc(sizeof(type)*(element))
　#define DELETE(addr)　　　free(addr)

　#define READ_BUF_SIZE　 8192

　#define PROC_MODE_ULINE　 1
　#define PROC_MODE_RLINE　 2

　int proc_mode=PROC_MODE_ULINE;
　int disp_total=0;
　int enclose　 =0;

　char *fname=NULL;
　FILE *fp=stdin;

　struct unique_data_strct{
　　char *text;
　　int　total;
　　struct unique_data_strct *next;
　};
　struct unique_data_strct *unique_data=NULL;

　char *new_strcat(char *string1,char *string2);

　void main(argc,argv)
　int　 argc;
　char *argv[];
　{
　　if(analize_argment(argc,argv)== -1)goto end;

　　if(open_source()== -1)goto end;

　　if(proc_mode==PROC_MODE_ULINE){
　　　process_uline();
　　}
　　else{
　　　process_rline();
　　}

　　close_source();

　end:
　　exit(0);
　}

　analize_argment(argc,argv)
　int　 argc;
　char *argv[];
　{
　　int i;

　　for(i=1;i<argc;i++){
　　　　　 if(stricmp(argv[i],"-u")==0)proc_mode=PROC_MODE_ULINE;
　　　else if(stricmp(argv[i],"-r")==0)proc_mode=PROC_MODE_RLINE;
　　　else if(stricmp(argv[i],"-n")==0)disp_total=1;
　　　else if(stricmp(argv[i],"-c")==0)enclose　 =1;
　　　else{
　　　　for(;i<argc;i++){
　　　　　if(fname && fname[0])fname=new_strcat(fname," ");
　　　　　fname=new_strcat(fname,argv[i]);
　　　　}
　　　}
　　}

　　return 1;
　}

　char *new_strcat(char *string1,char *string2)
　{
　　if(string1==NULL){
　　　string1=NEW(char,strlen(string2)+1);
　　　strcpy(string1,string2);
　　}
　　else{
　　　int l;

　　　l=strlen(string1)+strlen(string2)+1;

　　　if(l>_msize(string1))string1=realloc(string1,l);

　　　strcat(string1,string2);
　　}

　　return string1;
　}

　open_source()
　{
　　if(fname){
　　　fp=fopen(fname,"r");
　　　if(fp==NULL){
　　　　printf("入力ファイル\n");
　　　　printf("File : %s\n",fname);
　　　　printf("のオープン時にエラーが発生しました。\n");
　　　　printf("%s\n",strerror(errno));
　　　　return -1;
　　　}
　　}

　　return 1;
　}

　close_source()
　{
　　if(fname)fclose(fp);

　　return 1;
　}

　display_text(total,text)
　int　total;
　char *text;
　{
　　if(enclose){
　　　if(disp_total)printf("%4d (%s)\n",total,text);
　　　else　　　　　printf("(%s)\n",text);
　　}
　　else{
　　　if(disp_total)printf("%4d %s\n",total,text);
　　　else　　　　　printf("%s\n",text);
　　}

　　return 1;
　}

　process_rline()
　{
　　char　data[READ_BUF_SIZE+1];
　　int　ldata;

　　char *prev_text;
　　char * cur_text;
　　int　is,ie,total;

　　prev_text=new_strcat(NULL,"");
　　 cur_text=NULL;

　　total=0;
　　while(ldata=fread(data,1,READ_BUF_SIZE,fp)){
　　　data[ldata]='\n';

　　　for(is=ie=0;ie<=ldata;ie++){
　　　　if(data[ie]!='\n')continue;

　　　　data[ie]='\0';
　　　　cur_text=new_strcat(cur_text,data+is);
　　　　is=ie+1;

　　　　if(ie==ldata)break;

　　　　if(strcmp(prev_text,cur_text)!=0){
　　　　　if(total)display_text(total,prev_text);

　　　　　prev_text[0]='\0';
　　　　　prev_text=new_strcat(prev_text,cur_text);

　　　　　total=0;
　　　　}

　　　　total++;

　　　　cur_text[0]='\0';
　　　}
　　}

　　if(total)display_text(total,prev_text);

　　DELETE(prev_text);
　　DELETE( cur_text);

　　return 1;
　}

　process_uline()
　{
　char　data[READ_BUF_SIZE+1];
　int　ldata;

　char *text;
　int　is,ie;

　text=NULL;

　while(ldata=fread(data,1,READ_BUF_SIZE,fp)){
　　data[ldata]='\n';

　　for(is=ie=0;ie<=ldata;ie++){
　　　if(data[ie]!='\n')continue;

　　　data[ie]='\0';
　　　text=new_strcat(text,data+is);
　　　is=ie+1;

　　　if(ie==ldata)break;

　　　add_unique(text);

　　　text[0]='\0';
　　}

　　DELETE(text);

　　display_unique();

　　return 1;
　}

　add_unique(text)
　char *text;
　{
　　struct unique_data_strct *udp,*udc;

　　udp=NULL;
　　udc=unique_data;
　　while(udc){
　　　if(strcmp(udc->text,text)==0)break;

　　　udp=udc;
　　　udc=udc->next;
　　}

　　if(udc){
　　　udc->total++;
　　}
　　else{
　　　udc=NEW(struct unique_data_strct,1);
　　　udc->text=NEW(char,strlen(text)+1);
　　　strcpy(udc->text,text);
　　　udc->total=1;
　　　udc->next=NULL;

　　　if(udp==NULL)unique_data=udc;
　　　else　　　　 udp->next　=udc;
　　}

　　return 1;
　}

　display_unique()
　{
　　struct unique_data_strct *udc;

　　udc=unique_data;
　　while(udc){
　　　display_text(udc->total,udc->text);
　　　udc=udc->next;
　　}

　　return 1;
　}

前回は単純なリスト構造にしていたことが原因で表示が開始されるまでの時間がかかっていました。そこで今回は文字列の検索方法を変えて実行時間を短縮することを考えてみましょう。

考えられる検索方法は２分木を使うのが簡単で効果がありそうです。
そこで、 unique_data_strct 構造体を次のように変えます。

　struct unique_data_strct{
　　char *text;
　　int　total;
　　struct unique_data_strct *left,*right;
　};
　struct unique_data_strct *unique_data=NULL;

関数 process_uline も次のように変更します。

　process_uline()
　{
　　char　data[READ_BUF_SIZE+1];
　　int　ldata;

　　char *text;
　　int　is,ie;

　　text=NULL;

　　while(ldata=fread(data,1,READ_BUF_SIZE,fp)){
　　　data[ldata]='\n';

　　　for(is=ie=0;ie<=ldata;ie++){
　　　　if(data[ie]!='\n')continue;

　　　　data[ie]='\0';
　　　　text=new_strcat(text,data+is);
　　　　is=ie+1;

　　　　if(ie==ldata)break;

　　　　add_unique(text,unique_data);　　　/* 変更 */

　　　　text[0]='\0';
　　　}
　　}

　　DELETE(text);

　　display_unique(unique_data);　　　　　 /* 変更 */

　　return 1;
　}

次に関数 add_unique を次のようにします。この関数は再帰呼び出しの表現になっています。

　add_unique(text,udc)
　char *text;
　struct unidue_data_strct *udc;
　{
　　int sts;

　　if(udc==NULL){
　　　unique_data=new_unique_data(text);
　　}
　　else{
　　　sts=strcmp(text,udc->text);
　　　if(sts==0){
　　　　udc->total++;
　　　}
　　　else if(sts<0){
　　　　if(udc->left)add_unique(text,udc->left);
　　　　else　　　　 udc->left=new_unique_data(text);
　　　}
　　　else{
　　　　if(udc->right)add_unique(text,udc->right);
　　　　else　　　　　udc->right=new_unique_data(text);
　　　}
　　}

　　return 1;
　}

関数 new_unique_data は次のようにします。関数の型が int ではないので、これは予めプロトタイプの宣言をしておく必要があります。

　struct unique_data_strct *new_unique_data(text)
　char *text;
　{
　　struct unique_data_strct *udc;

　　udc=NEW(struct unique_data_strct,1);
　　udc->text=NEW(char,strlen(text)+1);
　　strcpy(udc->text,text);
　　udc->total=1;
　　udc->left =NULL;
　　udc->right=NULL;
　　udc->ldepth=0;
　　udc->rdepth=0;

　　return udc;
　}

更に表示を行なう関数 display_unique は再帰呼び出しで表現します。

　display_unique(udc)
　struct unique_data_strct *udc;
　{
　　if(udc->left )display_unique(udc->left );
　　display_text(udc->total,udc->text);
　　if(udc->right)display_unique(udc->right);

　　return 1;
　}

構造体の変更も含めて、上記の変更で検索の方法が２分木になりました。このプログラムで前回１分間かかってしまったテキストファイルを実行してみると、表示が開始されるまでの時間が２秒に短縮されました。更に２分木を上記のように表示すると文字列の小さい順から表示されるのでソートがかかっていることになり一石二鳥の大幅な改善になりました。元のテキストファイルは２５万行程度でした。このプログラムの結果を、

　uline 元ファイル > 結果ファイル

として結果をリダイレクションを行い結果ファイルに書き込んでみました。結果の行数は６千行程度になりました。このデータはソートがかかっているので、

　uline 結果ファイル

として実行してみると、今度は６秒もかかってしまいました。データの行数が５０分の１になっているのに実行時間が逆に増えてしまったのは、２分木の最悪の状態であるリスト構造と同一になっているためです。これもなんとかしなければなりません。

そこで、プログラミング５５回目の２分木の話しを思い出して下さい。バランスが悪くなった２分木を最適化する関数が使えそうです。殆どそれを利用して実験してみると、２５万行の元のデータの表示までの時間は殆ど変りなく、ソートがかかった６千行のデータでは測定できないぐらいの一瞬でした。

長くなりますが全リストです。

　#include <stdio.h>
　#include <stdlib.h>
　#include <string.h>
　#include <malloc.h>

　#define NEW(type,element) (type *)malloc(sizeof(type)*(element))
　#define DELETE(addr)　　　free(addr)

　#define READ_BUF_SIZE　 8192

　#define PROC_MODE_ULINE　 1
　#define PROC_MODE_RLINE　 2

　int proc_mode=PROC_MODE_ULINE;
　int disp_total=0;
　int enclose　 =0;

　char *fname=NULL;
　FILE *fp=stdin;

　struct unique_data_strct{
　　char *text;
　　int　total;
　　struct unique_data_strct *left,*right;
　　char ldepth,rdepth;
　};
　struct unique_data_strct *unique_data=NULL;

　char *new_strcat(char *string1,char *string2);
　struct unique_data_strct *new_unique_data(char *text);

　void main(argc,argv)
　int　 argc;
　char *argv[];
　{
　　if(analize_argment(argc,argv)== -1)goto end;

　　if(open_source()== -1)goto end;

　　if(proc_mode==PROC_MODE_ULINE)process_uline();
　　else　　　　　　　　　　　　　process_rline();

　　close_source();

　end:
　　exit(0);
　}

　analize_argment(argc,argv)
　int　 argc;
　char *argv[];
　{
　　int i;

　　for(i=1;i<argc;i++){
　　　　　 if(stricmp(argv[i],"-u")==0)proc_mode=PROC_MODE_ULINE;
　　　else if(stricmp(argv[i],"-r")==0)proc_mode=PROC_MODE_RLINE;
　　　else if(stricmp(argv[i],"-n")==0)disp_total=1;
　　　else if(stricmp(argv[i],"-c")==0)enclose　 =1;
　　　else{
　　　　for(;i<argc;i++){
　　　　　if(fname && fname[0])fname=new_strcat(fname," ");
　　　　　fname=new_strcat(fname,argv[i]);
　　　　}
　　　}
　　}

　　return 1;
　}

　char *new_strcat(char *string1,char *string2)
　{
　　if(string1==NULL){
　　　string1=NEW(char,strlen(string2)+1);
　　　strcpy(string1,string2);
　　}
　　else{
　　　int l;

　　　l=strlen(string1)+strlen(string2)+1;

　　　if(l>_msize(string1))string1=realloc(string1,l);

　　　strcat(string1,string2);
　　}

　　return string1;
　}

　open_source()
　{
　　if(fname){
　　　fp=fopen(fname,"r");
　　　if(fp==NULL){
　　　　printf("入力ファイル\n");
　　　　printf("File : %s\n",fname);
　　　　printf("のオープン時にエラーが発生しました。\n");
　　　　printf("%s\n",strerror(errno));
　　　　return -1;
　　　}
　　}

　　return 1;
　}

　close_source()
　{
　　if(fname)fclose(fp);

　　return 1;
　}

　display_text(total,text)
　int　total;
　char *text;
　{
　　if(enclose){
　　　if(disp_total)printf("%4d (%s)\n",total,text);
　　　else　　　　　printf("(%s)\n",text);
　　}
　　else{
　　　if(disp_total)printf("%4d %s\n",total,text);
　　　else　　　　　printf("%s\n",text);
　　}

　　return 1;
　}

　process_rline()
　{
　　char　data[READ_BUF_SIZE+1];
　　int　ldata;

　　char *prev_text;
　　char * cur_text;
　　int　is,ie,total;

　　prev_text=new_strcat(NULL,"");
　　 cur_text=NULL;

　　total=0;
　　while(ldata=fread(data,1,READ_BUF_SIZE,fp)){
　　　data[ldata]='\n';

　　　for(is=ie=0;ie<=ldata;ie++){
　　　　if(data[ie]!='\n')continue;

　　　　data[ie]='\0';
　　　　cur_text=new_strcat(cur_text,data+is);
　　　　is=ie+1;

　　　　if(ie==ldata)break;

　　　　if(strcmp(prev_text,cur_text)!=0){
　　　　　if(total)display_text(total,prev_text);

　　　　　prev_text[0]='\0';
　　　　　prev_text=new_strcat(prev_text,cur_text);

　　　　　total=0;
　　　　}

　　　　total++;

　　　　cur_text[0]='\0';
　　　}
　　}

　　if(total)display_text(total,prev_text);

　　DELETE(prev_text);
　　DELETE( cur_text);

　　return 1;
　}

　process_uline()
　{
　　char　data[READ_BUF_SIZE+1];
　　int　ldata;

　　char *text;
　　int　is,ie;

　　text=NULL;

　　while(ldata=fread(data,1,READ_BUF_SIZE,fp)){
　　　data[ldata]='\n';

　　　for(is=ie=0;ie<=ldata;ie++){
　　　　if(data[ie]!='\n')continue;

　　　　data[ie]='\0';
　　　　text=new_strcat(text,data+is);
　　　　is=ie+1;

　　　　if(ie==ldata)break;

　　　　add_unique(text,unique_data);

　　　　text[0]='\0';
　　　}
　　}

　　DELETE(text);

　　display_unique(unique_data);

　　return 1;
　}

　display_unique(udc)
　struct unique_data_strct *udc;
　{
　　if(udc->left )display_unique(udc->left );
　　display_text(udc->total,udc->text);
　　if(udc->right)display_unique(udc->right);

　　return 1;
　}

　struct unique_data_strct *new_unique_data(text)
　char *text;
　{
　　struct unique_data_strct *udc;

　　udc=NEW(struct unique_data_strct,1);
　　udc->text=NEW(char,strlen(text)+1);
　　strcpy(udc->text,text);
　　udc->total=1;
　　udc->left =NULL;
　　udc->right=NULL;
　　udc->ldepth=0;
　　udc->rdepth=0;

　　return udc;
　}

　add_unique(text,udc)
　char *text;
　struct unique_data_strct *udc;
　{
　　int sts;

　　if(udc==NULL){
　　　unique_data=new_unique_data(text);
　　}
　　else{
　　　sts=strcmp(text,udc->text);
　　　if(sts==0){
　　　　udc->total++;
　　　　return 1;
　　　}
　　　else if(sts<0){
　　　　if(udc->left)add_unique(text,udc->left);
　　　　else　　　　 udc->left=new_unique_data(text);
　　　}
　　　else{
　　　　if(udc->right)add_unique(text,udc->right);
　　　　else　　　　　udc->right=new_unique_data(text);
　　　}

　　　set_unique_depth(udc);
　　　　 rehash_unique(udc);
　　}

　　return 1;
　}

　set_unique_depth(udc)
　struct unique_data_strct *udc;
　{
　　if(udc->left){
　　　udc->ldepth=(udc->left->ldepth > udc->left->rdepth ?
　　　　　　　　　 udc->left->ldepth : udc->left->rdepth )+1;
　　}
　　else{
　　　udc->ldepth=0;
　　}

　　if(udc->right){
　　　udc->rdepth=(udc->right->ldepth > udc->right->rdepth ?
　　　　　　　　　 udc->right->ldepth : udc->right->rdepth )+1;
　　}
　　else{
　　　udc->rdepth=0;
　　}

　　return 1;
　}

　rehash_unique(udc)
　struct unique_data_strct *udc;
　{
　　struct unique_data_strct ut,*ul,*ur,*ulr,*url;
　　int diff;

　　diff=udc->ldepth - udc->rdepth;
　　if(diff==2){
　　　ul=udc->left;

　　　diff=ul->ldepth - ul->rdepth;
　　　if(diff==1){
　　　　ut　= *udc;
　　　　*udc= *ul ;
　　　　*ul =　ut ;

　　　　ul->left　=udc->right;
　　　　udc->right=ul;
　　　}
　　　else{
　　　　ulr=ul->right;

　　　　ut　= *udc;
　　　　*udc= *ulr;
　　　　*ulr=　ut ;

　　　　ulr->left =udc->right;
　　　　udc->right=ulr;
　　　　ul->right =udc->left;
　　　　udc->left =ul;

　　　　set_unique_depth(ulr);
　　　}

　　　set_unique_depth(ul);
　　}
　　else if(diff== -2){
　　　ur=udc->right;

　　　diff=ur->ldepth - ur->rdepth;
　　　if(diff== -1){
　　　　ut　= *udc;
　　　　*udc= *ur ;
　　　　*ur =　ut ;

　　　　ur->right=udc->left;
　　　　udc->left=ur;
　　　}
　　　else{
　　　　url=ur->left;

　　　　ut　= *udc;
　　　　*udc= *url;
　　　　*url=　ut ;

　　　　url->right=udc->left;
　　　　udc->left =url;
　　　　ur->left　=udc->right;
　　　　udc->right=ur;

　　　　set_unique_depth(url);
　　　}

　　　set_unique_depth(ur);
　　}
　　else{
　　　return 1;
　　}

　　set_unique_depth(udc);

　　return 1;
　}

ここまでで十分に実用になります。しかしもう少しこだわりをもつことにしましょう。
色々と実験してみると、１５０万行程度のテキストデータでは３０秒もかかってしまうことがどうしても納得いかないのです。考えてみれば１５０万個のデータの分類と個数の計測が、ファイルからの読み込み時間も含めて３０秒で終了するというのはこれで問題ないようなのですが、最後のこだわりを次回で考えていきましょう。

それではまた次回。

Tags

Archive

第120回プログラミングについて『作っておくと便利なちょっとしたプログラムその4』

最近の投稿

Categories

Archive

Tags

そのお悩み、
アポロ技研に話してみませんか？

第120回 プログラミングについて『作っておくと便利なちょっとしたプログラム その4』

最近の投稿

Categories

Archive

Tags

そのお悩み、アポロ技研に話してみませんか？

第120回プログラミングについて『作っておくと便利なちょっとしたプログラムその4』

そのお悩み、
アポロ技研に話してみませんか？