OAIP_Mirror/lab26/TextProcessingDict/TextProcessing.c
2024-12-06 23:11:06 +04:00

237 lines
5.7 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#define _CRT_SECURE_NO_WARNINGS
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <Windows.h>
#include "Dict.h"
// Forward declarations of the tokenizer helpers and processing stages that
// are defined further down in this file.
int getNextDelim(FILE* fp, char token[]);
int getNextWord(FILE* fp, char token[], int maxLen);
int LoadDictionary(char* filename);
int TextProcessing(char* filenameIn, char* filenameOut);
// Paths used by the current benchmark run. main() rewrites the text and HTML
// paths, test_dicts() rewrites the dictionary path, and test() reads all three.
char filenameDict[MAX_PATH] = "../Dictionaries/dict0.txt";
char filenameIn[MAX_PATH] = "../Texts/Alice.txt";
char filenameOut[MAX_PATH] = "out/Alice_out.html";
// results[i][j] is the run time in seconds stored by test(i, j): main() uses
// i = 0..2 (one row per text) and test_dicts() fills j = 0..14 (one column
// per dictionary file).
double results[3][15];
// Runs one benchmark: loads the dictionary named by filenameDict, converts
// the text filenameIn into the HTML file filenameOut and stores the time
// spent in TextProcessing() (in seconds) in results[i][j].
//
// i, j - row and column of results[][] that receive the measurement.
//
// If the dictionary cannot be loaded, or the conversion reports a failure,
// -1.0 is stored instead of a time so that the failed run is visible in the
// final table.
void test(int i, int j) {
    // report which files are being processed
    printf("HTML = %s\ntext = %s\ndict = %s\n",
        filenameOut, filenameIn, filenameDict);

    if (!LoadDictionary(filenameDict)) {
        // LoadDictionary() returns 0 before it ever calls Create(), so there
        // is no dictionary to query with Member() or to release with
        // Destroy(); running the conversion would use a dictionary that does
        // not exist.
        results[i][j] = -1.0;
        printf("Run skipped: dictionary was not loaded\n\n");
        return;
    }

    clock_t t0 = clock();
    int converted = TextProcessing(filenameIn, filenameOut);
    clock_t t1 = clock();
    // the dictionary was created by LoadDictionary(), release it in any case
    Destroy();

    if (!converted) {
        // a time measured for a failed conversion would be misleading
        results[i][j] = -1.0;
        printf("Run failed: HTML was not generated\n\n");
        return;
    }

    double runtime = (t1 - t0) / (double)CLOCKS_PER_SEC;
    results[i][j] = runtime;
    printf("t1 - t0 = %.3f sec (Run time of HTML generating)\n\n", runtime);
}
// Benchmarks the current text against every dictionary file.
//
// For each dictionary number 0..4 the three variants dictN.txt, dictNa.txt
// and dictNb.txt are tested in that order; their timings go to columns
// N, N + 5 and N + 10 of row i in results[][].
//
// i - row of results[][] (index of the text being processed).
void test_dicts(int i) {
    // one sprintf format per dictionary variant, in the order they are run
    static const char* const dictFormats[3] = {
        "../Dictionaries/dict%d.txt",
        "../Dictionaries/dict%da.txt",
        "../Dictionaries/dict%db.txt"
    };

    for (int number = 0; number < 5; number++) {
        for (int variant = 0; variant < 3; variant++) {
            sprintf(filenameDict, dictFormats[variant], number);
            // each variant occupies its own group of five columns
            test(i, number + 5 * variant);
        }
    }
}
// Entry point: benchmarks three texts against all dictionaries, then prints
// the table of collected run times (one row per text, one column per
// dictionary) and rings the console bell.
int main() {
    // {input text, generated HTML} for every benchmarked text
    static const char* const texts[3][2] = {
        { "../Texts/Alice.txt",    "out/Alice_out.html" },
        { "../Texts/Tolkien.txt",  "out/Tolkien_out.html" },
        { "../Texts/Tolkien2.txt", "out/Tolkien2_out.html" }
    };

    for (int text = 0; text < 3; text++) {
        strcpy(filenameIn, texts[text][0]);
        strcpy(filenameOut, texts[text][1]);
        test_dicts(text);
    }

    // summary table of all measurements
    for (int row = 0; row < 3; row++) {
        for (int col = 0; col < 15; col++) {
            printf("%7.3lf ", results[row][col]);
        }
        printf("\n");
    }

    // audible signal that the whole benchmark has finished
    printf("\a");
    return 0;
}
// Creates the dictionary and fills it with every word found in a text file.
//
// filename - path of the dictionary file.
//
// Returns 0 if the file cannot be opened; in that case Create() is not
// called and no dictionary exists. Returns 1 once the dictionary has been
// created. If a read error interrupts loading, a warning is printed and the
// dictionary holds the words read up to that point.
int LoadDictionary(char* filename) {
    // open the file
    FILE* fin = fopen(filename, "r");
    if (fin == NULL) {
        // the file could not be opened - report it
        printf("File %s didn't open!\n", filename);
        return 0;
    }
    Create();
    char token[MAX_LEN_WORD];
    // Until the end of the file. ferror() is tested as well: after a read
    // error getc() keeps returning EOF while feof() stays false, so testing
    // feof() alone would loop forever.
    while (!feof(fin) && !ferror(fin)) {
        // skip all delimiters in front of the next word
        while (getNextDelim(fin, token)) {
        }
        // if a word follows - store it in the dictionary
        if (getNextWord(fin, token, MAX_LEN_WORD)) {
            Insert(token);
        }
    }
    if (ferror(fin)) {
        printf("Error while reading %s, dictionary may be incomplete!\n", filename);
    }
    // close the dictionary file
    fclose(fin);
    return 1;
}
// Writes one delimiter (a one-character string produced by getNextDelim())
// to the HTML output, escaping the characters that are special in HTML.
static void writeDelimHtml(FILE* fout, const char* delim)
{
    switch (delim[0]) {
    case '<':
        fputs("&lt;", fout);
        break;
    case '>':
        fputs("&gt;", fout);
        break;
    case '&':
        fputs("&amp;", fout);
        break;
    case '\n':
        // <br> keeps the line break visible in the browser, the newline
        // keeps the generated HTML source readable
        fputs("<br>\n", fout);
        break;
    default:
        fputs(delim, fout);
        break;
    }
}

// Converts a text file into an HTML document in which every word that is
// present in the dictionary (see Member()) is shown in bold.
//
// filenameIn  - path of the text to read.
// filenameOut - path of the HTML file to create (overwritten if it exists).
//
// Returns 1 on success. Returns 0 if either file cannot be opened, if
// reading the input fails, or if writing/closing the output fails; a message
// is printed in each of these cases.
int TextProcessing(char* filenameIn, char* filenameOut) {
    // open the input file
    FILE* fin = fopen(filenameIn, "r");
    if (fin == NULL) {
        // the file could not be opened - report it
        printf("File %s doesn't opened!\n", filenameIn);
        return 0;
    }
    // open the output file
    FILE* fout = fopen(filenameOut, "w");
    if (fout == NULL) {
        // the file could not be opened - report it
        printf("File %s doesn't opened!\n", filenameOut);
        // and close the input file
        fclose(fin);
        return 0;
    }

    // write the header of the HTML document
    fputs("<!DOCTYPE html>", fout);
    fputs("<html>", fout);
    fputs("<head>", fout);
    fputs("<meta http-equiv = \"Content-Type\" content = \"text/html; charset=cp1251\" />", fout);
    fputs("<title>HTML Document</title>", fout);
    fputs("</head>", fout);
    fputs("<body>", fout);

    char token[MAX_LEN_WORD];
    // Until the end of the input. The error indicators are tested as well:
    // after a read error getc() keeps returning EOF while feof() stays
    // false, so testing feof() alone would loop forever; and there is no
    // point in going on once the output can no longer be written.
    while (!feof(fin) && !ferror(fin) && !ferror(fout)) {
        // copy all delimiters in front of the next word
        while (getNextDelim(fin, token)) {
            writeDelimHtml(fout, token);
        }
        // if a word follows - copy it, in bold when the dictionary has it
        if (getNextWord(fin, token, MAX_LEN_WORD)) {
            if (Member(token)) {
                fprintf(fout, "<b>%s</b>", token);
            }
            else {
                fputs(token, fout);
            }
        }
    }

    // write the closing tags of the HTML document
    fputs("</body>", fout);
    fputs("</html>", fout);

    int ok = 1;
    if (ferror(fin)) {
        printf("Error while reading %s!\n", filenameIn);
        ok = 0;
    }
    // close the input and output files; fclose() flushes the output, so its
    // result is part of the write status
    fclose(fin);
    int writeFailed = ferror(fout);
    if (fclose(fout) != 0) {
        writeFailed = 1;
    }
    if (writeFailed) {
        printf("Error while writing %s!\n", filenameOut);
        ok = 0;
    }
    return ok;
}
int isalpha_my(unsigned char ch);
// Reads the next character from fp if it is a delimiter (any character that
// isalpha_my() does not classify as a letter).
//
// fp    - stream to read from.
// token - buffer with room for at least two chars.
//
// Returns 1 if a delimiter was read; token then holds it as a
// one-character string.
// Returns 0 at end of input or if the next character is a letter; a letter
// is pushed back to the stream and token is left untouched, so its contents
// must not be relied upon.
int getNextDelim(FILE* fp, char token[])
{
    const int c = getc(fp);

    if (c != EOF && !isalpha_my((unsigned char)c)) {
        token[0] = (unsigned char)c;
        token[1] = '\0';
        return 1;
    }

    // a letter belongs to the next word - leave it in the stream
    if (c != EOF) {
        ungetc(c, fp);
    }
    return 0;
}
// Reads the next word (a run of characters accepted by isalpha_my()) from fp.
//
// fp     - stream to read from.
// token  - buffer that receives the word as a NUL-terminated string.
// maxLen - size of token; at most maxLen - 1 characters are stored, the
//          remaining letters of a longer word stay in the stream.
//
// Returns 1 if at least one letter was read, token holds the word.
// Returns 0 if the stream did not start with a letter, token then holds an
// empty string.
int getNextWord(FILE* fp, char token[], int maxLen)
{
    int len = 0;
    int c;

    for (;;) {
        c = getc(fp);
        // stop at end of input or when the buffer is full
        if (c == EOF || len >= maxLen - 1) {
            break;
        }
        // stop at the first character that is not a letter
        if (!isalpha_my((unsigned char)c)) {
            break;
        }
        token[len++] = c;
    }

    // return the character that ended the word to the stream;
    // ungetc() does nothing when it is given EOF
    ungetc(c, fp);
    token[len] = '\0';
    return len != 0;
}
// Tells whether ch is a letter.
//
// Returns 1 if ch is a letter, 0 otherwise.
// A letter is either a character accepted by isalpha() (Latin letters with
// codes < 128) or a Russian letter in the ANSI (Windows-1251) encoding:
// codes 192..255 plus the letter "yo" in upper case (168) and lower case
// (184), which lies outside that contiguous range.
int isalpha_my(unsigned char ch) {
    enum {
        CP1251_FIRST_LETTER = 192, // first code of the contiguous Cyrillic range
        CP1251_YO_UPPER = 168,     // upper-case "yo"
        CP1251_YO_LOWER = 184      // lower-case "yo"
    };

    if (isalpha(ch)) {
        return 1;
    }
    // ANSI encoding: the range runs up to 255, the largest unsigned char
    // value, so only the lower bound has to be tested
    if (ch >= CP1251_FIRST_LETTER) {
        return 1;
    }
    if (ch == CP1251_YO_UPPER || ch == CP1251_YO_LOWER) {
        return 1;
    }
    return 0;
}