OAIP/lab26/TextProcessingFile/TextProcessingFile.c
2024-12-06 18:45:55 +04:00

259 lines
7.3 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#define _CRT_SECURE_NO_WARNINGS
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <Windows.h>
// Size of the token buffers used while scanning files: the longest word that
// is stored is MAX_LEN_WORD - 1 characters plus the terminating '\0'.
#define MAX_LEN_WORD 128
// Tokenizer helpers and the converter, defined further down in this file.
int getNextDelim(FILE* fp, char token[]);
int getNextWord(FILE* fp, char token[], int maxLen);
int TextProcessing(char* filenameIn, char* filenameOut);
// File names shared by the whole program. main() overwrites them before each
// run; Member() always reads the dictionary named by filenameDict.
char filenameDict[MAX_PATH] = "../Dictionaries/dict0.txt";
char filenameIn[MAX_PATH] = "../Texts/Alice.txt";
char filenameOut[MAX_PATH] = "out/Alice_out.html";
// Alternative absolute paths, kept disabled:
//char filenameIn[] = "c:\\Temp\\FIST2024\\TextMarkup\\Tolkien2.txt";
//char filenameOut[] = "c:\\Temp\\FIST2024\\TextMarkup\\Tolkien2_out.html";
// Runs one text-to-HTML conversion and prints timing information.
//
// filenameDict - dictionary file name; it is only reported here. The lookup
//                itself (Member) reads the file-scope filenameDict, so callers
//                are expected to pass that same buffer.
// filenameIn   - text file to convert.
// filenameOut  - HTML file to create.
//
// All time stamps are clock() readings converted to seconds. This variant
// keeps no dictionary in memory, so the "loading" (t1 - t0) and "destroying"
// (t3 - t2) phases are empty and their durations are expected to be ~0.
void test(char filenameDict[], char filenameIn[], char filenameOut[]) {
    const double ticksPerSec = (double)CLOCKS_PER_SEC;

    // Report which files are being processed.
    printf("HTML file %s\nis created from text file %s\nwith highlighting words from %s\ndictionary\n\n",
        filenameOut, filenameIn, filenameDict);

    // t0 - clock reading before the (empty) dictionary loading phase.
    clock_t t0 = clock();
    printf("t0 = %.3f sec \n", t0 / ticksPerSec);

    // t1 - clock reading after the dictionary loading phase.
    clock_t t1 = clock();
    printf("t1 = %.3f sec \n", t1 / ticksPerSec);

    TextProcessing(filenameIn, filenameOut);

    // t2 - clock reading after the text has been converted.
    clock_t t2 = clock();
    printf("t2 = %.3f sec \n", t2 / ticksPerSec);

    // t3 - clock reading after the (empty) dictionary destroying phase.
    clock_t t3 = clock();
    printf("t3 = %.3f sec \n", t3 / ticksPerSec);

    printf("t1 - t0 = %.3f sec (Run time of dictionary loading)\n", (t1 - t0) / ticksPerSec);
    printf("t2 - t1 = %.3f sec (Run time of HTML generating)\n", (t2 - t1) / ticksPerSec);
    // The destroy phase is t3 - t2 (the previous code printed t2 - t1 here).
    printf("t3 - t2 = %.3f sec (Run time of dictionary destroying )\n\n\n", (t3 - t2) / ticksPerSec);
}
// Entry point: converts every input text once per dictionary variant.
// The file-scope name buffers are filled in before each call to test(),
// because Member() reads the dictionary name from filenameDict.
int main() {
    // { input text, output HTML } pairs, processed in this order.
    static const char* const texts[][2] = {
        { "../Texts/Tolkien.txt",  "out/Tolkien_out.html"  },
        { "../Texts/Tolkien2.txt", "out/Tolkien2_out.html" },
    };
    // Dictionary variants tried for every text, in this order.
    static const char* const dictionaries[] = {
        "../Dictionaries/dict0.txt",
        "../Dictionaries/dict0a.txt",
        "../Dictionaries/dict0b.txt",
    };
    const size_t textCount = sizeof texts / sizeof texts[0];
    const size_t dictCount = sizeof dictionaries / sizeof dictionaries[0];

    for (size_t t = 0; t < textCount; t++) {
        strcpy(filenameIn, texts[t][0]);
        strcpy(filenameOut, texts[t][1]);
        for (size_t d = 0; d < dictCount; d++) {
            strcpy(filenameDict, dictionaries[d]);
            test(filenameDict, filenameIn, filenameOut);
        }
    }
    return 0;
}
// Checks whether word occurs in the dictionary stored in the file named by
// the file-scope filenameDict.
//
// The dictionary file is opened and scanned from the beginning on every call;
// words are compared exactly (case-sensitively) with strcmp.
//
// Returns 1 if the word is found, 0 if it is not found or the dictionary
// file cannot be opened (a message is printed in the latter case).
int Member(char* word) {
    FILE* fin = fopen(filenameDict, "rt");
    if (fin == NULL) {
        // The dictionary could not be opened - report it.
        printf("File %s doesn't opened!\n", filenameDict);
        return 0;
    }

    char token[MAX_LEN_WORD];
    int found = 0;
    for (;;) {
        // Skip every delimiter in front of the next word.
        while (getNextDelim(fin, token)) {
        }
        // After the delimiters the next character is either a letter or there
        // is nothing left to read. Stopping here (instead of testing feof)
        // also terminates the loop when a read error occurs.
        if (!getNextWord(fin, token, MAX_LEN_WORD)) {
            break;
        }
        if (strcmp(token, word) == 0) {
            found = 1;
            break;
        }
    }

    fclose(fin);
    return found;
}
// Converts the text file filenameIn into the HTML document filenameOut.
//
// Every word that Member() finds in the dictionary is wrapped in <b>...</b>;
// all other words and all delimiters are copied through. The characters
// '<', '>' and '&' are written as HTML entities so that the text cannot be
// interpreted as markup, and every line feed is preceded by <br>.
//
// Returns 1 on success. Returns 0 if either file cannot be opened (a message
// is printed) or if a read/write error is detected while converting.
int TextProcessing(char* filenameIn, char* filenameOut) {
    FILE* fin = fopen(filenameIn, "rt");
    if (fin == NULL) {
        // The input file could not be opened - report it.
        printf("File %s doesn't opened!\n", filenameIn);
        return 0;
    }

    FILE* fout = fopen(filenameOut, "wt");
    if (fout == NULL) {
        // The output file could not be opened - report it.
        printf("File %s doesn't opened!\n", filenameOut);
        fclose(fin);
        return 0;
    }

    // HTML document header.
    fputs("<!DOCTYPE html>", fout);
    fputs("<html>", fout);
    fputs("<head>", fout);
    fputs("<meta http-equiv = \"Content-Type\" content = \"text/html; charset=cp1251\" />", fout);
    fputs("<title>HTML Document</title>", fout);
    fputs("</head>", fout);
    fputs("<body>", fout);

    char token[MAX_LEN_WORD];
    for (;;) {
        // Copy every delimiter in front of the next word. A delimiter token
        // is always a single character followed by '\0'.
        while (getNextDelim(fin, token)) {
            switch (token[0]) {
            case '<':
                fputs("&lt;", fout);
                break;
            case '>':
                fputs("&gt;", fout);
                break;
            case '&':
                fputs("&amp;", fout);
                break;
            case '\n':
                // Keep the line break visible in the browser and in the file.
                fputs("<br>", fout);
                fputs(token, fout);
                break;
            default:
                fputs(token, fout);
                break;
            }
        }
        // After the delimiters the next character is either a letter or there
        // is nothing left to read. Stopping here (instead of testing feof)
        // also terminates the loop when a read error occurs.
        if (!getNextWord(fin, token, MAX_LEN_WORD)) {
            break;
        }
        // Highlight the word if the dictionary contains it.
        if (Member(token)) {
            fprintf(fout, "<b>%s</b>", token);
        }
        else {
            fputs(token, fout);
        }
    }

    // Closing tags of the HTML document.
    fputs("</body>", fout);
    fputs("</html>", fout);

    // Any stream error, or a failed final flush, makes the result unreliable.
    int ok = !ferror(fin) && !ferror(fout);
    fclose(fin);
    if (fclose(fout) != 0) {
        ok = 0;
    }
    return ok;
}
int isalpha_my(unsigned char ch);
// Tries to read one delimiter (any character that is not a letter) from fp.
//
// Returns 1 if a delimiter was read; token then holds that single character
// followed by '\0'.
// Returns 0 at end of input or when the next character is a letter. A letter
// is pushed back onto the stream; token is left unmodified in both cases.
int getNextDelim(FILE* fp, char token[])
{
    const int c = getc(fp);

    if (c == EOF) {
        return 0;
    }
    if (!isalpha_my((unsigned char)c)) {
        token[0] = (unsigned char)c;
        token[1] = '\0';
        return 1;
    }
    // A letter belongs to the next word - leave it for getNextWord.
    ungetc(c, fp);
    return 0;
}
// Tries to read one word (a run of letters) from fp into token.
//
// At most maxLen - 1 letters are stored, followed by '\0'. The character that
// stopped the scan (a non-letter, or the letter that did not fit) is pushed
// back onto the stream so the next read sees it again.
//
// Returns 1 if at least one letter was stored.
// Returns 0 otherwise; token then holds an empty string.
int getNextWord(FILE* fp, char token[], int maxLen)
{
    int len = 0;
    int c;

    for (;;) {
        c = getc(fp);
        if (c == EOF || len >= maxLen - 1) {
            break;
        }
        if (!isalpha_my((unsigned char)(c))) {
            break;
        }
        token[len] = c;
        len++;
    }
    // With c == EOF this call does nothing to the stream.
    ungetc(c, fp);
    token[len] = '\0';
    return (len != 0) ? 1 : 0;
}
// Reports whether ch is a letter.
//
// Returns 1 for letters recognised by isalpha() and for the Russian letters
// of the Windows-1251 ("ANSI") code page, including the letters YO (0xA8)
// and yo (0xB8), which lie outside the contiguous alphabet range.
// Returns 0 for every other character.
int isalpha_my(unsigned char ch) {
    enum {
        CP1251_FIRST_LETTER = 192,  // 0xC0, capital A of the Russian alphabet
        CP1251_CAPITAL_YO = 168,    // 0xA8
        CP1251_SMALL_YO = 184       // 0xB8
    };

    if (isalpha(ch)) {
        return 1;
    }
    // Codes 192..255 are the capital (192..223) and small (224..255) letters.
    if (ch >= CP1251_FIRST_LETTER) {
        return 1;
    }
    // Without these two codes a word containing YO/yo would be split in two.
    if (ch == CP1251_CAPITAL_YO || ch == CP1251_SMALL_YO) {
        return 1;
    }
    return 0;
}