Facebook
From Lousy Frog, 3 Years ago, written in Plain Text.
Embed
Download Paste or View Raw
Hits: 101
  1. ///
  2. /// @file
  3. /// @author Julius Pettersson
  4. /// @copyright MIT/Expat License.
  5. /// @brief LZW file compressor
  6. /// @version 5
  7. /// @remark This version borrows heavily from Juha Nieminen's work.
  8. ///
  9. /// This is the C++11 implementation of a Lempel-Ziv-Welch single-file command-line compressor.
  10. /// It uses the simpler fixed-width code compression method.
  11. /// It was written with Doxygen comments.
  12. ///
  13. /// @see http://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch
  14. /// @see http://marknelson.us/2011/11/08/lzw-revisited/
  15. /// @see http://www.cs.duke.edu/csed/curious/compression/lzw.html
  16. /// @see http://warp.povusers.org/EfficientLZW/index.html
  17. /// @see http://en.cppreference.com/
  18. /// @see http://www.doxygen.org/
  19. ///
  20.  
  21. #include <algorithm>
  22. #include <array>
  23. #include <climits>
  24. #include <cstddef>
  25. #include <cstdint>
  26. #include <cstdlib>
  27. #include <exception>
  28. #include <fstream>
  29. #include <ios>
  30. #include <iostream>
  31. #include <istream>
  32. #include <limits>
  33. #include <memory>
  34. #include <ostream>
  35. #include <stdexcept>
  36. #include <string>
  37. #include <utility>
  38. #include <vector>
  39.  
  40. /// Type used to store and retrieve codes.
  41. using CodeType = std::uint16_t;
  42.  
  43. namespace globals {
  44.  
  45. /// Dictionary Maximum Size (when reached, the dictionary will be reset)
  46. const CodeType dms {std::numeric_limits<CodeType>::max()};
  47.  
  48. } // namespace globals
  49.  
  50. ///
  51. /// @brief Encoder's custom dictionary type.
  52. ///
  53. class EncoderDictionary {
  54.  
  55.     ///
  56.     /// @brief Binary search tree node.
  57.     ///
  58.     struct Node {
  59.  
  60.         ///
  61.         /// @brief Default constructor.
  62.         /// @param c    byte that the Node will contain
  63.         ///
  64.         explicit Node(char c): first(globals::dms), c(c), left(globals::dms), right(globals::dms)
  65.         {
  66.         }
  67.  
  68.         CodeType    first;  ///< Code of first child string.
  69.         char        c;      ///< Byte.
  70.         CodeType    left;   ///< Code of child node with byte < `c`.
  71.         CodeType    right;  ///< Code of child node with byte > `c`.
  72.     };
  73.  
  74. public:
  75.  
  76.     ///
  77.     /// @brief Default constructor.
  78.     /// @details It builds the `initials` cheat sheet.
  79.     ///
  80.     EncoderDictionary()
  81.     {
  82.         const long int minc = std::numeric_limits<char>::min();
  83.         const long int maxc = std::numeric_limits<char>::max();
  84.         CodeType k {0};
  85.  
  86.         for (long int c = minc; c <= maxc; ++c)
  87.             initials[static_cast<unsigned char> (c)] = k++;
  88.  
  89.         vn.reserve(globals::dms);
  90.         reset();
  91.     }
  92.  
  93.     ///
  94.     /// @brief Resets dictionary to its initial contents.
  95.         /// @see `EncoderDictionary::EncoderDictionary()`
  96.     ///
  97.     void reset()
  98.     {
  99.         vn.clear();
  100.  
  101.         const long int minc = std::numeric_limits<char>::min();
  102.         const long int maxc = std::numeric_limits<char>::max();
  103.  
  104.         for (long int c = minc; c <= maxc; ++c)
  105.             vn.push_back(Node(c));
  106.     }
  107.  
  108.     ///
  109.     /// @brief Searches for a pair (`i`, `c`) and inserts the pair if it wasn't found.
  110.     /// @param i                code to search for
  111.     /// @param c                attached byte to search for
  112.     /// @returns The index of the pair, if it was found.
  113.     /// @retval globals::dms    if the pair wasn't found
  114.     ///
  115.     CodeType search_and_insert(CodeType i, char c)
  116.     {
  117.         // dictionary's maximum size was reached
  118.         if (vn.size() == globals::dms)
  119.             reset();
  120.  
  121.         if (i == globals::dms)
  122.             return search_initials(c);
  123.  
  124.         const CodeType vn_size = vn.size();
  125.         CodeType ci {vn[i].first}; // Current Index
  126.  
  127.         if (ci != globals::dms)
  128.         {
  129.             while (true)
  130.                 if (c < vn[ci].c)
  131.                 {
  132.                     if (vn[ci].left == globals::dms)
  133.                     {
  134.                         vn[ci].left = vn_size;
  135.                         break;
  136.                     }
  137.                     else
  138.                         ci = vn[ci].left;
  139.                 }
  140.                 else
  141.                 if (c > vn[ci].c)
  142.                 {
  143.                     if (vn[ci].right == globals::dms)
  144.                     {
  145.                         vn[ci].right = vn_size;
  146.                         break;
  147.                     }
  148.                     else
  149.                         ci = vn[ci].right;
  150.                 }
  151.                 else // c == vn[ci].c
  152.                     return ci;
  153.         }
  154.         else
  155.             vn[i].first = vn_size;
  156.  
  157.         vn.push_back(Node(c));
  158.         return globals::dms;
  159.     }
  160.  
  161.     ///
  162.     /// @brief Fakes a search for byte `c` in the one-byte area of the dictionary.
  163.     /// @param c    byte to search for
  164.     /// @returns The code associated to the searched byte.
  165.     ///
  166.     CodeType search_initials(char c) const
  167.     {
  168.         return initials[static_cast<unsigned char> (c)];
  169.     }
  170.  
  171. private:
  172.  
  173.     /// Vector of nodes on top of which the binary search tree is implemented.
  174.     std::vector<Node> vn;
  175.  
  176.     /// Cheat sheet for mapping one-byte strings to their codes.
  177.     std::array<CodeType, 1u << CHAR_BIT> initials;
  178. };
  179.  
  180. ///
  181. /// @brief Compresses the contents of `is` and writes the result to `os`.
  182. /// @param [in] is      input stream
  183. /// @param [out] os     output stream
  184. ///
  185. void compress(std::istream &is, std::ostream &os)
  186. {
  187.     EncoderDictionary ed;
  188.     CodeType i {globals::dms}; // Index
  189.     char c;
  190.  
  191.     while (is.get(c))
  192.     {
  193.         const CodeType temp {i};
  194.  
  195.         if ((i = ed.search_and_insert(temp, c)) == globals::dms)
  196.         {
  197.             os.write(reinterpret_cast<const char *> (&temp), sizeof (CodeType));
  198.             i = ed.search_initials(c);
  199.         }
  200.     }
  201.  
  202.     if (i != globals::dms)
  203.         os.write(reinterpret_cast<const char *> (&i), sizeof (CodeType));
  204. }
  205.  
  206. ///
  207. /// @brief Decompresses the contents of `is` and writes the result to `os`.
  208. /// @param [in] is      input stream
  209. /// @param [out] os     output stream
  210. ///
  211. void decompress(std::istream &is, std::ostream &os)
  212. {
  213.     std::vector<std::pair<CodeType, char>> dictionary;
  214.  
  215.     // "named" lambda function, used to reset the dictionary to its initial contents
  216.     const auto reset_dictionary = [&dictionary] {
  217.         dictionary.clear();
  218.         dictionary.reserve(globals::dms);
  219.  
  220.         const long int minc = std::numeric_limits<char>::min();
  221.         const long int maxc = std::numeric_limits<char>::max();
  222.  
  223.         for (long int c = minc; c <= maxc; ++c)
  224.             dictionary.push_back({globals::dms, static_cast<char> (c)});
  225.     };
  226.  
  227.     const auto rebuild_string = [&dictionary](CodeType k) -> const std::vector<char> * {
  228.         static std::vector<char> s; // String
  229.  
  230.         s.clear();
  231.  
  232.         // the length of a string cannot exceed the dictionary's number of entries
  233.         s.reserve(globals::dms);
  234.  
  235.         while (k != globals::dms)
  236.         {
  237.             s.push_back(dictionary[k].second);
  238.             k = dictionary[k].first;
  239.         }
  240.  
  241.         std::reverse(s.begin(), s.end());
  242.         return &s;
  243.     };
  244.  
  245.     reset_dictionary();
  246.  
  247.     CodeType i {globals::dms}; // Index
  248.     CodeType k; // Key
  249.  
  250.     while (is.read(reinterpret_cast<char *> (&k), sizeof (CodeType)))
  251.     {
  252.         // dictionary's maximum size was reached
  253.         if (dictionary.size() == globals::dms)
  254.             reset_dictionary();
  255.  
  256.         if (k > dictionary.size())
  257.             throw std::runtime_error("invalid compressed code");
  258.  
  259.         const std::vector<char> *s; // String
  260.  
  261.         if (k == dictionary.size())
  262.         {
  263.             dictionary.push_back({i, rebuild_string(i)->front()});
  264.             s = rebuild_string(k);
  265.         }
  266.         else
  267.         {
  268.             s = rebuild_string(k);
  269.  
  270.             if (i != globals::dms)
  271.                 dictionary.push_back({i, s->front()});
  272.         }
  273.  
  274.         os.write(&s->front(), s->size());
  275.         i = k;
  276.     }
  277.  
  278.     if (!is.eof() || is.gcount() != 0)
  279.         throw std::runtime_error("corrupted compressed file");
  280. }
  281.  
  282. ///
  283. /// @brief Prints usage information and a custom error message.
  284. /// @param s    custom error message to be printed
  285. /// @param su   Show Usage information
  286. ///
  287. void print_usage(const std::string &s = "", bool su = true)
  288. {
  289.     if (!s.empty())
  290.         std::cerr << "\nERROR: " << s << '\n';
  291.  
  292.     if (su)
  293.     {
  294.         std::cerr << "\nUsage:\n";
  295.         std::cerr << "\tprogram -flag input_file output_file\n\n";
  296.         std::cerr << "Where `flag' is either `c' for compressing, or `d' for decompressing, and\n";
  297.         std::cerr << "`input_file' and `output_file' are distinct files.\n\n";
  298.         std::cerr << "Examples:\n";
  299.         std::cerr << "\tlzw_v5.exe -c license.txt license.lzw\n";
  300.         std::cerr << "\tlzw_v5.exe -d license.lzw new_license.txt\n";
  301.     }
  302.  
  303.     std::cerr << std::endl;
  304. }
  305.  
  306. ///
  307. /// @brief Actual program entry point.
  308. /// @param argc             number of command line arguments
  309. /// @param [in] argv        array of command line arguments
  310. /// @retval EXIT_FAILURE    for failed operation
  311. /// @retval EXIT_SUCCESS    for successful operation
  312. ///
  313. int main(int argc, char *argv[])
  314. {
  315.     if (argc != 4)
  316.     {
  317.         print_usage("Wrong number of arguments.");
  318.         return EXIT_FAILURE;
  319.     }
  320.  
  321.     enum class Mode {
  322.         Compress,
  323.         Decompress
  324.     };
  325.  
  326.     Mode m;
  327.  
  328.     if (std::string(argv[1]) == "-c")
  329.         m = Mode::Compress;
  330.     else
  331.     if (std::string(argv[1]) == "-d")
  332.         m = Mode::Decompress;
  333.     else
  334.     {
  335.         print_usage(std::string("flag `") + argv[1] + "' is not recognized.");
  336.         return EXIT_FAILURE;
  337.     }
  338.  
  339.     const std::size_t buffer_size {1024 * 1024};
  340.  
  341.     // these custom buffers should be larger than the default ones
  342.     const std::unique_ptr<char[]> input_buffer(new char[buffer_size]);
  343.     const std::unique_ptr<char[]> output_buffer(new char[buffer_size]);
  344.  
  345.     std::ifstream input_file;
  346.     std::ofstream output_file;
  347.  
  348.     input_file.rdbuf()->pubsetbuf(input_buffer.get(), buffer_size);
  349.     input_file.open(argv[2], std::ios_base::binary);
  350.  
  351.     if (!input_file.is_open())
  352.     {
  353.         print_usage(std::string("input_file `") + argv[2] + "' could not be opened.");
  354.         return EXIT_FAILURE;
  355.     }
  356.  
  357.     output_file.rdbuf()->pubsetbuf(output_buffer.get(), buffer_size);
  358.     output_file.open(argv[3], std::ios_base::binary);
  359.  
  360.     if (!output_file.is_open())
  361.     {
  362.         print_usage(std::string("output_file `") + argv[3] + "' could not be opened.");
  363.         return EXIT_FAILURE;
  364.     }
  365.  
  366.     try
  367.     {
  368.         input_file.exceptions(std::ios_base::badbit);
  369.         output_file.exceptions(std::ios_base::badbit | std::ios_base::failbit);
  370.  
  371.         if (m == Mode::Compress)
  372.             compress(input_file, output_file);
  373.         else
  374.         if (m == Mode::Decompress)
  375.             decompress(input_file, output_file);
  376.     }
  377.     catch (const std::ios_base::failure &f)
  378.     {
  379.         print_usage(std::string("File input/output failure: ") + f.what() + '.', false);
  380.         return EXIT_FAILURE;
  381.     }
  382.     catch (const std::exception &e)
  383.     {
  384.         print_usage(std::string("Caught exception: ") + e.what() + '.', false);
  385.         return EXIT_FAILURE;
  386.     }
  387.  
  388.     return EXIT_SUCCESS;
  389. }