Browse Source

NoDuplicatedSource.cs

Piotr Czajkowski 6 months ago
commit
55ad7e9c41
1 changed files with 95 additions and 0 deletions
  1. 95 0
      20210624/NoDuplicatedSource.cs

+ 95 - 0
20210624/NoDuplicatedSource.cs

@@ -0,0 +1,95 @@
+// Streams a TMX file and removes translation units whose source segment duplicates an earlier one
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Xml;
+using System.Xml.Linq;
+
+namespace DuplicatesInTMX
+{
+    /// <summary>
+    /// Command-line tool that copies a TMX file to "output.tmx", keeping only the
+    /// first translation unit (tu) seen for each distinct source segment.
+    /// </summary>
+    class Program
+    {
+        /// <summary>
+        /// Advances <paramref name="reader"/> to the first "header" element and loads it.
+        /// </summary>
+        /// <param name="reader">Reader over the TMX document.</param>
+        /// <returns>
+        /// The loaded header element, or null when the end of the document is
+        /// reached without finding one.
+        /// </returns>
+        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
+        static XElement ReadHeader(XmlReader reader)
+        {
+            if (reader == null)
+                throw new ArgumentNullException("reader");
+
+            reader.MoveToContent();
+
+            while (reader.Read())
+            {
+                if (reader.NodeType == XmlNodeType.Element
+                    && reader.Name == "header")
+                {
+                    // ReadFrom consumes the whole element and leaves the reader
+                    // on the node that follows it; on an Element node it always
+                    // produces an XElement, so a direct cast is safe.
+                    return (XElement)XElement.ReadFrom(reader);
+                }
+            }
+
+            return null;
+        }
+
+        /// <summary>
+        /// Lazily yields the "tu" elements found from the reader's current position
+        /// onwards, skipping every unit whose source segment was already seen.
+        /// </summary>
+        /// <remarks>
+        /// The source segment is taken to be the "seg" child of the first "tuv"
+        /// child of the unit; units without either element are dropped. Segments
+        /// are compared by their serialized markup, so inline tags take part in
+        /// the comparison.
+        /// </remarks>
+        /// <param name="reader">Reader over the TMX document.</param>
+        /// <returns>A streaming sequence of the units that are kept.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
+        static IEnumerable<XElement> NoDuplicatedSource(XmlReader reader)
+        {
+            // Validate here rather than inside the iterator so that a null
+            // argument fails at the call site instead of on first enumeration.
+            if (reader == null)
+                throw new ArgumentNullException("reader");
+
+            return NoDuplicatedSourceIterator(reader);
+        }
+
+        /// <summary>
+        /// Iterator behind <see cref="NoDuplicatedSource"/>; assumes a non-null reader.
+        /// </summary>
+        private static IEnumerable<XElement> NoDuplicatedSourceIterator(XmlReader reader)
+        {
+            var seenSources = new HashSet<string>();
+
+            while (!reader.EOF)
+            {
+                if (reader.NodeType != XmlNodeType.Element || reader.Name != "tu")
+                {
+                    // Not a translation unit: step to the next node. Stop when
+                    // the reader cannot advance any further.
+                    if (!reader.Read())
+                        yield break;
+
+                    continue;
+                }
+
+                // ReadFrom already moves the reader past </tu>, so we must NOT
+                // call Read() again before inspecting the current node. Doing so
+                // would skip an adjacent <tu> that directly follows this one.
+                var tu = (XElement)XElement.ReadFrom(reader);
+
+                var tuv = tu.Element("tuv");
+                if (tuv == null)
+                    continue;
+
+                var source = tuv.Element("seg");
+                if (source == null)
+                    continue;
+
+                // Add returns false when the text is already present.
+                if (seenSources.Add(source.ToString()))
+                    yield return tu;
+            }
+        }
+
+        /// <summary>
+        /// Entry point. Reads the TMX file given as the first argument and writes
+        /// the de-duplicated document to "output.tmx" in the current directory.
+        /// </summary>
+        /// <param name="args">Command-line arguments; args[0] is the input path.</param>
+        /// <exception cref="InvalidOperationException">The input has no header element.</exception>
+        static void Main(string[] args)
+        {
+            if (args.Length == 0)
+            {
+                Console.WriteLine("You need to specify a path to TMX file!");
+                return;
+            }
+
+            var settings = new XmlReaderSettings
+            {
+                DtdProcessing = DtdProcessing.Ignore
+            };
+
+            using (XmlReader reader = XmlReader.Create(args[0], settings))
+            {
+                var header = ReadHeader(reader);
+                if (header == null)
+                    throw new InvalidOperationException("There's no header in the file!");
+
+                // XStreamingElement defers enumeration of its content until Save,
+                // so translation units are read and written one at a time. Save
+                // must therefore run while the reader is still open.
+                var root = new XStreamingElement("tmx",
+                    new XAttribute("version", "1.4"),
+                    header,
+                    new XStreamingElement("body", NoDuplicatedSource(reader)));
+
+                root.Save("output.tmx");
+            }
+        }
+    }
+}