Browse Source

First version

Piotr Czajkowski 7 years ago
parent
commit
ff36614bb9
15 changed files with 320 additions and 2 deletions
  1. 16 2
      README.md
  2. 14 0
      anonymize.c
  3. 64 0
      comments.c
  4. 3 0
      comments.h
  5. 46 0
      dict.c
  6. 15 0
      dict.h
  7. 28 0
      keyval.c
  8. 10 0
      keyval.h
  9. 9 0
      makefile
  10. 18 0
      stopif.h
  11. BIN
      test.docx
  12. 13 0
      xmlbuff.c
  13. 9 0
      xmlbuff.h
  14. 74 0
      zip.c
  15. 1 0
      zip.h

+ 16 - 2
README.md

@@ -1,2 +1,16 @@
-# anonymizeDOCXComments
-Word DOCX l10n XML
+# Anonymize DOCX Comments
+
+While reviewing Word documents, translators and reviewers often use tracked changes and comments to exchange feedback on translations. These people usually come from different organizations and shouldn't know about each other, hence the need to anonymize comments, which is what this tool does for you.
+
+It goes through the comments in "word/comments.xml" and changes each author's name to Author<number>, where the number starts from 1. It keeps track of authors, so "John Smith", for instance, will always be "Author1". When it's done, it prints the list of authors and their new names.
+
+Usage:
+*./anonymize test.docx* - test.docx will be replaced with anonymized version.
+*./anonymize test.docx test2.docx* - anonymized version will be saved as test2.docx leaving original test.docx intact.
+
+Running it on provided *test.docx* should produce:
+	"King, Stephen" is now "Author1"
+	"Kowalski, Jan" is now "Author2"
+	"Piotr Fronczewski" is now "Author3"
+
+You'll need libarchive and libxml2 to compile it. It was created as a learning project while I was exploring C, so use it freely, but at your own risk. Output was tested with Word 2013 and LibreOffice Writer.

+ 14 - 0
anonymize.c

@@ -0,0 +1,14 @@
+#include <stdio.h>
+#include "zip.h"
+
+/**
+ * Entry point: anonymizes the comment authors of the DOCX named by the first
+ * argument. An optional second argument names the output file; without it
+ * process() is called with a NULL output path.
+ *
+ * Returns 0 on success, and 1 on a usage error or when process() reports a
+ * non-positive result.
+ */
+int main(int argc, char **argv) {
+	if (argc < 2) {
+		printf("Usage: %s <path_to_DOCX>\n", argv[0]);
+		printf("Optionaly provide output file as second argument.\n");
+		return 1; // a missing argument is an error, not a successful run
+	}
+
+	char *outfile = (argc > 2) ? argv[2] : NULL;
+	return process(argv[1], outfile) > 0 ? 0 : 1;
+}

+ 64 - 0
comments.c

@@ -0,0 +1,64 @@
+#define _GNU_SOURCE //asks stdio.h to include asprintf
+#include <stdio.h>
+#include <libxml2/libxml/xpath.h>
+#include <libxml2/libxml/xpathInternals.h>
+#include "stopif.h"
+#include "xmlbuff.h"
+#include "dict.h"
+
+/**
+ * Returns the replacement name for authorName, creating "Author<N>" the first
+ * time a name is seen (N is the dictionary length plus one).
+ *
+ * Ownership: for a new author the dictionary stores the authorName pointer
+ * itself as the key (no copy is made) together with the freshly allocated
+ * replacement; for a known author authorName is not stored. The returned
+ * pointer is the dictionary's value and must not be freed by the caller.
+ *
+ * Returns NULL, without adding anything, when the replacement name cannot be
+ * allocated.
+ */
+static char* anonymizeAuthor(dictionary *authors, xmlChar const *authorName) {
+	char *name = (char*)authorName;
+	char *newName = (char*)dictionary_find(authors, name);
+
+	if (newName)
+		return newName;
+
+	// On failure asprintf leaves newName undefined, so only its return value can be trusted.
+	if (asprintf(&newName, "Author%d", authors->length+1) < 0)
+		return NULL;
+	dictionary_add(authors, name, newName);
+	return newName;
+}
+
+/** Prints one line per dictionary entry: the original author name and its replacement. */
+static void printAuthors(dictionary *authors) {
+	int idx = 0;
+	while (idx < authors->length) {
+		keyval *entry = authors->pairs[idx];
+		printf("\"%s\" is now \"%s\"\n", entry->key, (char*)entry->value);
+		idx++;
+	}
+}
+
+/**
+ * Replaces every w:comment/@w:author value in the XML held by infile with an
+ * anonymous "Author<N>" name and, on success, prints the old-to-new mapping.
+ *
+ * On success infile->data points at the re-serialized UTF-8 document
+ * (allocated by libxml2) and infile->size holds its length. The buffer that
+ * was passed in is never freed here. When parsing, XPath evaluation or
+ * anonymization fails, infile->data and infile->size are left as they were.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+int anonymizeComments(XMLBuff *infile) {
+	int rc = -1;
+	xmlDocPtr doc = NULL;
+	xmlXPathContextPtr context = NULL;
+	xmlXPathObjectPtr authors = NULL;
+	xmlChar *buf = NULL;
+	int authorCount = 0;
+	dictionary *anonAuthors = dictionary_new();
+	const xmlChar *authorPath = (xmlChar*)"//w:comment/@w:author";
+	const xmlChar* prefix = (xmlChar*)"w";
+	const xmlChar* ns = (xmlChar*)"http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+
+	doc = xmlReadMemory(infile->data, infile->size, infile->name, NULL, 0);
+	Stopif(!doc, goto cleanup, "Error: unable to parse file \"%s\"\n", infile->name);
+
+	context = xmlXPathNewContext(doc);
+	Stopif(!context, goto cleanup, "Error: unable to create new XPath context\n");
+
+	Stopif(xmlXPathRegisterNs(context, prefix, ns), goto cleanup, "Error: Can't add namespace!\n");
+
+	authors = xmlXPathEvalExpression(authorPath, context);
+	Stopif(!authors, goto cleanup, "Something is wrong with XPATH %s\n", authorPath);
+
+	// A document without comment authors may come back without a node set at all.
+	if (authors->nodesetval)
+		authorCount = authors->nodesetval->nodeNr;
+
+	for (int i=0; i < authorCount; i++){
+		xmlNodePtr node = authors->nodesetval->nodeTab[i];
+		xmlChar *authorName = xmlNodeGetContent(node);
+		Stopif(!authorName, goto cleanup, "Error: unable to read author name\n");
+
+		int knownAuthors = anonAuthors->length;
+		char *anonAuthor = anonymizeAuthor(anonAuthors, authorName);
+		// The dictionary keeps authorName (as its key) only when it grew;
+		// otherwise the string is still ours and must be released here.
+		if (anonAuthors->length == knownAuthors)
+			xmlFree(authorName);
+		Stopif(!anonAuthor, goto cleanup, "Error: unable to anonymize author\n");
+		xmlNodeSetContent(node, (xmlChar*)anonAuthor);
+	}
+
+	xmlDocDumpMemoryEnc(doc, &buf, &infile->size, "UTF-8");
+	infile->data = (char*)buf;
+	Stopif(!infile->size, goto cleanup, "Errors: unable to save file %s\n", infile->name);
+	rc = 0;
+
+cleanup:
+	if (authors) xmlXPathFreeObject(authors);
+	if (context) xmlXPathFreeContext(context);
+	if (doc) xmlFreeDoc(doc);
+	if (rc == 0) {
+		xmlCleanupParser();
+		printAuthors(anonAuthors);
+	}
+	// Releases the replacement names and the author names the dictionary adopted.
+	dictionary_free(anonAuthors);
+	return rc;
+}

+ 3 - 0
comments.h

@@ -0,0 +1,3 @@
+#include "xmlbuff.h"
+
+/** Anonymizes the comment authors in the XML buffer described by infile
+    (implemented in comments.c). Returns 0 on success and -1 on failure. */
+int anonymizeComments(XMLBuff *infile);

+ 46 - 0
dict.c

@@ -0,0 +1,46 @@
+// Borrowed from https://github.com/b-k/21st-Century-Examples
+#include <stdio.h>
+#include <stdlib.h>
+#include "dict.h"
+
+// Sentinel pointer; the first dictionary_new() call points it at a private static object.
+void *dictionary_not_found;
+
+/** Allocates an empty dictionary (no pairs, length 0); release it with
+    dictionary_free(). Aborts when memory is exhausted, in the same way
+    dictionary_add() aborts on an invalid key. */
+dictionary *dictionary_new (void){
+	static int dnf;
+	if (!dictionary_not_found) dictionary_not_found = &dnf;
+	dictionary *out= malloc(sizeof *out);
+	if (!out){fprintf(stderr, "Out of memory.\n"); abort();}
+	*out= (dictionary){ .pairs = NULL, .length = 0 };
+	return out;
+} 
+
+/** Appends kv to the dictionary, growing the pairs array by one slot.
+    Aborts when the array cannot be grown. */
+static void dictionary_add_keyval(dictionary *in, keyval *kv){
+	// Leave pairs and length untouched until realloc has succeeded.
+	keyval **grown = realloc(in->pairs, sizeof(keyval*)*(in->length+1));
+	if (!grown){fprintf(stderr, "Out of memory.\n"); abort();}
+	in->pairs = grown;
+	in->pairs[in->length++] = kv;
+}
+
+/** Stores value under key. Both pointers are stored as given, without copying.
+    A NULL key is treated as a fatal error. */
+void dictionary_add(dictionary *in, char *key, void *value){
+	if (key == NULL){
+		fprintf(stderr, "NULL is not a valid key.\n");
+		abort();
+	}
+	keyval *pair = keyval_new(key, value);
+	dictionary_add_keyval(in, pair);
+}
+
+/** Returns the value of the first pair whose key matches (as decided by
+    keyval_matches), or NULL when no pair matches. */
+void *dictionary_find(dictionary const *in, char const *key){
+	int idx = 0;
+	while (idx < in->length){
+		keyval *candidate = in->pairs[idx++];
+		if (keyval_matches(candidate, key))
+			return candidate->value;
+	}
+	return NULL;
+}
+
+/** Builds a new dictionary holding a keyval_copy() of every pair in `in`.
+    The copies share key and value pointers with the source pairs, and
+    keyval_free() releases those pointers, so freeing both dictionaries
+    would free the same keys and values twice. */
+dictionary *dictionary_copy(dictionary *in){
+	dictionary *out = dictionary_new();
+	int idx = 0;
+	while (idx < in->length){
+		keyval *twin = keyval_copy(in->pairs[idx]);
+		dictionary_add_keyval(out, twin);
+		idx++;
+	}
+	return out;
+}
+
+/** Frees every pair (through keyval_free), the pairs array and the dictionary
+    itself. Passing NULL is a no-op, mirroring free(), so cleanup paths can
+    call it unconditionally. */
+void dictionary_free(dictionary *in){
+	if (!in) return;
+	for (int i=0; i< in->length; i++)
+		keyval_free(in->pairs[i]);
+	free(in->pairs);
+	free(in);
+}

+ 15 - 0
dict.h

@@ -0,0 +1,15 @@
+// Borrowed from https://github.com/b-k/21st-Century-Examples
+#include "keyval.h"
+
+// Sentinel pointer defined in dict.c; dictionary_new() initializes it on first use.
+extern void *dictionary_not_found;
+
+// A list of key/value pairs searched linearly by dictionary_find().
+typedef struct dictionary{
+   // Array of `length` pointers to pairs.
+   keyval **pairs;
+   // Number of pairs currently stored.
+   int length;
+} dictionary;
+
+// Allocates an empty dictionary.
+dictionary *dictionary_new (void);
+// Returns a new dictionary whose pairs are keyval_copy() duplicates of in's pairs.
+dictionary *dictionary_copy(dictionary *in);
+// Frees all pairs, the pairs array and the dictionary itself.
+void dictionary_free(dictionary *in);
+// Appends a pair holding the given key and value pointers; a NULL key aborts the program.
+void dictionary_add(dictionary *in, char *key, void *value);
+// Returns the value of the first pair matching key, or NULL when there is none.
+void *dictionary_find(dictionary const *in, char const *key);

+ 28 - 0
keyval.c

@@ -0,0 +1,28 @@
+// Borrowed from https://github.com/b-k/21st-Century-Examples
+#include <stdlib.h> //malloc
+#include <strings.h> //strcasecmp (from POSIX)
+#include "keyval.h"
+
+/** Allocates a pair that stores the given key and value pointers as-is
+    (nothing is copied). Aborts when memory is exhausted instead of writing
+    through a NULL pointer. */
+keyval *keyval_new(char *key, void *value){
+    keyval *out = malloc(sizeof *out);
+    if (!out) abort();
+    *out = (keyval){.key = key, .value=value};
+    return out;
+}
+
+/** Copy a key/value pair. The new pair has pointers to
+  the values in the old pair, not copies of their data.
+  Aborts when memory is exhausted instead of writing through a NULL pointer. */
+keyval *keyval_copy(keyval const *in){
+    keyval *out = malloc(sizeof *out);
+    if (!out) abort();
+    *out = *in;
+    return out;
+}
+
+/** Frees the pair together with its key and value, so both must be
+    heap-allocated and owned by the pair. Passing NULL is a no-op,
+    mirroring free(). */
+void keyval_free(keyval *in){
+	if (!in) return;
+	free(in->key);
+	free(in->value);
+	free(in);
+}
+
+/** Returns 1 when the pair's key equals key, ignoring case (strcasecmp), and 0 otherwise. */
+int keyval_matches(keyval const *in, char const *key){
+    int difference = strcasecmp(in->key, key);
+    return difference == 0;
+}

+ 10 - 0
keyval.h

@@ -0,0 +1,10 @@
+// Borrowed from https://github.com/b-k/21st-Century-Examples
+// One dictionary entry: a string key and an untyped value pointer.
+typedef struct keyval{
+   // Key string; released with free() by keyval_free().
+   char *key;
+   // Value pointer; also released with free() by keyval_free().
+   void *value;
+} keyval;
+
+// Allocates a pair storing the given pointers without copying what they point to.
+keyval *keyval_new(char *key, void *value);
+// Allocates a new pair sharing the key and value pointers of `in`.
+keyval *keyval_copy(keyval const *in);
+// Frees the pair together with its key and value.
+void keyval_free(keyval *in);
+// Returns nonzero when the pair's key equals key, compared case-insensitively.
+int keyval_matches(keyval const *in, char const *key);

+ 9 - 0
makefile

@@ -0,0 +1,9 @@
+P=anonymize
+# Compiler flags only; linker flags for libxml2 belong in LDLIBS below.
+CFLAGS=`pkg-config --cflags libxml-2.0` -g -Wall -O3 -std=gnu99
+LDLIBS=`pkg-config --libs libxml-2.0` -larchive
+objects=keyval.o dict.o comments.o zip.o xmlbuff.o
+
+# No recipe here: make's implicit rules compile and link the program with the objects above.
+$(P): $(objects)
+
+# clean names a command, not a file; -f keeps it from failing when nothing has been built.
+.PHONY: clean
+clean:
+	rm -f *.o

+ 18 - 0
stopif.h

@@ -0,0 +1,18 @@
+// Borrowed from https://github.com/b-k/21st-Century-Examples
+#include <stdio.h>
+#include <stdlib.h> //abort
+
+/** Set this to \c 's' to stop the program on an error.
+    Otherwise, functions return a value on failure.*/
+// NOTE(review): this line (and error_log below) is a definition rather than an
+// extern declaration, so every .c file that includes this header defines the
+// variable — confirm the toolchain links that (it presumably fails with -fno-common).
+char error_mode;
+
+/** To where should I write errors? If this is \c NULL, write to \c stderr. */
+FILE *error_log;
+
+/** Stopif(assertion, error_action, printf-style message...):
+    when assertion is true, print the message followed by an extra newline to
+    error_log (or stderr when error_log is NULL), then call abort() if
+    error_mode is 's'; otherwise execute error_action (for example a return
+    statement). The expansion is a bare { } block rather than
+    do { } while (0), so "Stopif(...);" must not be the unbraced body of an
+    if that has an else. */
+#define Stopif(assertion, error_action, ...) {                    \
+        if (assertion){                                           \
+            fprintf(error_log ? error_log : stderr, __VA_ARGS__); \
+            fprintf(error_log ? error_log : stderr, "\n");        \
+            if (error_mode=='s') abort();                         \
+            else                 {error_action;}                  \
+        } }

BIN
test.docx


+ 13 - 0
xmlbuff.c

@@ -0,0 +1,13 @@
+#include <stdlib.h>
+#include "xmlbuff.h"
+
+/** Allocates a zero-initialized XMLBuff (NULL data, NULL name, size 0);
+    release it with XMLBuffFree(). Aborts when memory is exhausted instead of
+    writing through a NULL pointer. */
+XMLBuff *XMLBuffNew(void) {
+	XMLBuff *out= malloc(sizeof *out);
+	if (!out) abort();
+	*out= (XMLBuff){ .data = NULL, .name = NULL, .size = 0 };
+	return out;
+}
+
+/** Frees the buffer's data with free() and then the XMLBuff itself, so data
+    must be heap-allocated (or NULL). The name is not freed. Passing NULL is a
+    no-op, mirroring free(). */
+void XMLBuffFree(XMLBuff *in) {
+	if (!in) return;
+	free(in->data);
+	free(in);
+}

+ 9 - 0
xmlbuff.h

@@ -0,0 +1,9 @@
+// An in-memory file: contents, the name it is known by, and its length.
+typedef struct XMLBuff
+{
+	// Contents; XMLBuffFree() releases this pointer with free().
+	char *data;
+	// Name of the file; not freed by XMLBuffFree().
+	const char *name;
+	// Length of data in bytes.
+	int size;
+} XMLBuff;
+
+// Allocates a zero-initialized XMLBuff.
+XMLBuff *XMLBuffNew(void);
+// Frees in->data and then the XMLBuff itself.
+void XMLBuffFree(XMLBuff *in);

+ 74 - 0
zip.c

@@ -0,0 +1,74 @@
+#include <stdio.h>
+#include <archive.h>
+#include <archive_entry.h>
+#include <string.h>
+#include "comments.h"
+#include "stopif.h"
+
+/**
+ * Anonymizes the comments part held in buf and writes the result to
+ * archiveOut as a new regular-file entry named path.
+ *
+ * buf stays owned by the caller: it is only released here when
+ * anonymizeComments() replaced it with a buffer of its own.
+ *
+ * Returns 1 on success, -1 when anonymization fails, -2 when the entry cannot
+ * be created or its header written, and -3 when the data cannot be written.
+ */
+static int processComments(struct archive *archiveOut, char buf[], size_t size, const char* path) {
+	int rc = 1;
+	struct archive_entry *newEntry = NULL;
+	XMLBuff *comments = XMLBuffNew();
+	*comments = (XMLBuff){.data=buf, .size=size, .name=path};
+
+	// Never write the original, un-anonymized comments when anonymization failed.
+	Stopif(anonymizeComments(comments) != 0, rc = -1; goto cleanup, "Can't anonymize comments (%s)!\n", path);
+
+	newEntry = archive_entry_new();
+	Stopif(!newEntry, rc = -2; goto cleanup, "Can't create entry (comments)!\n");
+	archive_entry_set_pathname(newEntry, path);
+	archive_entry_set_size(newEntry, comments->size);
+	archive_entry_set_filetype(newEntry, AE_IFREG);
+	archive_entry_set_perm(newEntry, 0664);
+
+	Stopif(archive_write_header(archiveOut, newEntry) != ARCHIVE_OK, rc = -2; goto cleanup, "Can't write entry header (comments)!\n");
+	Stopif(archive_write_data(archiveOut, comments->data, comments->size) != comments->size, rc = -3; goto cleanup, "Can't write data (comments)!\n");
+
+cleanup:
+	if (newEntry) archive_entry_free(newEntry);
+	// If data still aliases the caller's buffer, XMLBuffFree() must not free() it.
+	if (comments->data == buf)
+		comments->data = NULL;
+	XMLBuffFree(comments);
+	return rc;
+}
+
+/**
+ * Copies every entry of archiveIn to archiveOut. The entry named
+ * "word/comments.xml" is routed through processComments(); all other entries
+ * are copied unchanged.
+ *
+ * Returns 1 on success, -1 when comment processing fails, -2 on a read,
+ * allocation or header-write failure and -3 when entry data cannot be written.
+ */
+static int rewriteZIP(struct archive *archiveIn, struct archive *archiveOut) {
+	const char *commentsFile = "word/comments.xml";
+	struct archive_entry *entryIn;
+	int status;
+
+	while ((status = archive_read_next_header(archiveIn, &entryIn)) == ARCHIVE_OK) {
+		const char* path = archive_entry_pathname(entryIn);
+		long long declared = archive_entry_size(entryIn);
+		Stopif(!path || declared < 0, return -2, "Archive entry has no usable name or size!\n");
+
+		// Heap buffer: the size comes from the archive, so it may be zero
+		// (e.g. directories) or far larger than the stack can hold.
+		size_t size = (size_t)declared;
+		char *buf = malloc(size ? size : 1);
+		Stopif(!buf, return -2, "Out of memory reading %s!\n", path);
+
+		int rc = 1;
+		Stopif(archive_read_data(archiveIn, buf, size) != declared, rc = -2, "Archive entry has no size (%s)!\n", path);
+
+		if (rc == 1 && strcmp(commentsFile, path) == 0){
+			// processComments() reports failure with negative codes, never 0.
+			Stopif(processComments(archiveOut, buf, size, path) <= 0, rc = -1, "Can't process comments!\n");
+		} else if (rc == 1) {
+			Stopif(archive_write_header(archiveOut, entryIn) != ARCHIVE_OK, rc = -2, "Can't write entry header!\n");
+			if (rc == 1) {
+				Stopif(archive_write_data(archiveOut, buf, size) != declared, rc = -3, "Can't write data!\n");
+			}
+		}
+
+		free(buf);
+		if (rc != 1)
+			return rc;
+	}
+
+	// Anything other than end-of-archive means the input could not be read completely.
+	Stopif(status != ARCHIVE_EOF, return -2, "Can't read next archive entry!\n");
+	return 1;
+}
+
+/**
+ * Reads the ZIP archive infile and writes an anonymized copy to outfile.
+ * Both archive handles are released on every path.
+ *
+ * Returns 1 on success and -1 on any failure.
+ */
+static int processDOCX(char const *infile, char const *outfile) {
+	int rc = -1;
+	struct archive *archiveIn = archive_read_new();
+	struct archive *archiveOut = archive_write_new();
+	Stopif(!archiveIn || !archiveOut, goto cleanup, "Can't allocate archive handles!\n");
+
+	archive_read_support_format_zip(archiveIn);
+	Stopif(archive_read_open_filename(archiveIn, infile, 10240) != ARCHIVE_OK, goto cleanup, "Can't read file %s!\n", infile);
+
+	// The output file is only created once the input has been opened.
+	archive_write_set_format_zip(archiveOut);
+	Stopif(archive_write_open_filename(archiveOut, outfile) != ARCHIVE_OK, goto cleanup, "Can't create new archive %s\n", outfile);
+
+	// rewriteZIP() reports failure with negative codes, so "!result" would never fire.
+	Stopif(rewriteZIP(archiveIn, archiveOut) <= 0, goto cleanup, "Problems rewriting zip!\n");
+	rc = 1;
+
+cleanup:
+	if (archiveIn) {
+		Stopif(archive_read_free(archiveIn) != ARCHIVE_OK, rc = -1, "Can't free %s!\n", infile);
+	}
+	if (archiveOut) {
+		Stopif(archive_write_free(archiveOut) != ARCHIVE_OK, rc = -1, "Can't free %s!\n", outfile);
+	}
+	return rc;
+}
+
+/**
+ * Anonymizes infile. With a distinct outfile the result is written there and
+ * infile is left alone. With a NULL outfile, or one equal to infile, the
+ * result is written to "tmpFile.docx" and then renamed over infile.
+ *
+ * The original file is only replaced when processing succeeded; after a
+ * failure the temporary file may be left behind.
+ *
+ * Returns 1 on success and -1 on failure.
+ */
+int process(char const *infile, char *outfile) {
+	if (outfile && strcmp(infile, outfile) != 0)
+		return processDOCX(infile, outfile) > 0 ? 1 : -1;
+
+	const char *tmpName = "tmpFile.docx";
+	// Never rename a failed or partial result over the user's document.
+	if (processDOCX(infile, tmpName) <= 0)
+		return -1;
+	Stopif(rename(tmpName, infile) != 0, return -1, "Can't replace %s with the anonymized version!\n", infile);
+	return 1;
+}

+ 1 - 0
zip.h

@@ -0,0 +1 @@
+/** Anonymizes the DOCX at infile (implemented in zip.c). The result goes to
+    outfile; when outfile is NULL or equal to infile, infile itself is replaced
+    via the temporary file "tmpFile.docx". Returns 1 on success. */
+int process(char const *infile, char *outfile);