Skip to Main Content

HathiTrust Research Center

Extracted Features Dataset

HTRC's Extracted Features Dataset consists of about 17.1 million bzip2-compressed JSON files, one for each volume in HathiTrust at the time of the last Extracted Features Dataset update (v.2.0). Each volume's JSON file contains some volume-level metadata and a set of features for each page in that volume. Each page is split into a header, body, and footer, each of which has:

  • A count of its tokens, lines, blank lines, and sentences
  • A count of the characters that begin and end each line (very handy for identifying tables-of-contents and poetry)
  • A count of the frequency of each word on the page subdivided by the part-of-speech it was classified under. Extracted Features Dataset v.2.0 used Stanford NLP to tag words with Penn Treebank POS tags in English.

Because these structural attributes and word counts are non-consumptive, they are also available for in-copyright works. The HTRC Extracted Features Dataset provides pre-formulated research data up to the full scale of the collection, but lacks the flexibility provided by HTRC Data Capsules, which let researchers process full text in its original order.

HTRC offers an Extracted Features Download Helper that formulates an rsync download script for works in an HTRC Workset.

Example

Below is a page from a HathiTrust copy of an 1897 reprint of Tottel's Miscellany:

Page 122 from a reprint of Tottel's Miscellany

The following is an abridged example of volume and page-level JSON data for the above page, with omissions designated by ...:

{
  "@context": "https://worksets.htrc.illinois.edu/context/ef_context.jsonld",
  "schemaVersion": "https://schemas.hathitrust.org/EF_Schema_v_3.0",
  "id": "https://data.analytics.hathitrust.org/extracted-features/20200210/mdp.39015056246112",
  "htid": "mdp.39015056246112",
  "type": "DataFeed",
  "publisher": {
    "id": "https://analytics.hathitrust.org",
    "type": "Organization",
    "name": "HathiTrust Research Center"
  },
  "datePublished": 20200210,
  "metadata": {
    "schemaVersion": "https://schemas.hathitrust.org/EF_Schema_MetadataSubSchema_v_3.0",
    "id": "http://hdl.handle.net/2027/mdp.39015056246112",
    "type": [
      "DataFeedItem",
      "Book"
    ],
    "dateCreated": 20200209,
    "title": "Tottel's miscellany : songes and sonnettes /",
    "contributor": [
      {
        "id": "http://www.viaf.org/viaf/79450",
        "type": "http://id.loc.gov/ontologies/bibframe/Person",
        "name": "Tottel, Richard, d. 1594."
      },
      ...
    ],
    "pubDate": 1897,
    "publisher": {
      "id": "http://catalogdata.library.illinois.edu/lod/entities/ProvisionActivityAgent/ht/A.%20Constable%20and%20Co.",
      "type": "http://id.loc.gov/ontologies/bibframe/Organization",
      "name": "A. Constable and Co."
    },
    "pubPlace": {
      "id": "http://id.loc.gov/vocabulary/countries/enk",
      "type": "http://id.loc.gov/ontologies/bibframe/Place",
      "name": "England"
    },
    "language": "eng",
    "accessRights": "pd",
    "accessProfile": "google",
    "sourceInstitution": {
      "type": "http://id.loc.gov/ontologies/bibframe/Organization",
      "name": "MIU"
    },
    "mainEntityOfPage": [
      "https://catalog.hathitrust.org/Record/003830169",
      "http://catalog.hathitrust.org/api/volumes/brief/oclc/22833386.json",
      "http://catalog.hathitrust.org/api/volumes/full/oclc/22833386.json"
    ],
    "oclc": "22833386",
    "genre": "http://id.loc.gov/vocabulary/marcgt/doc",
    "typeOfResource": "http://id.loc.gov/ontologies/bibframe/Text",
    "lastRightsUpdateDate": 20161013
  },
  "features": {
    "schemaVersion": "https://schemas.hathitrust.org/EF_Schema_FeaturesSubSchema_v_3.0",
    "id": "http://hdl.handle.net/2027/mdp.39015056246112",
    "type": "DataFeedItem",
    "dateCreated": 20200124,
    "pageCount": 328,
    "pages": [
      ...
      {
        "seq": "00000142",
        "version": "8dc88b60eb813a9d6f22eb627d735b89",
        "tokenCount": 388,
        "lineCount": 43,
        "emptyLineCount": 0,
        "sentenceCount": 11,
        "header": {
          "tokenCount": 8,
          "lineCount": 3,
          "emptyLineCount": 0,
          "sentenceCount": 2,
          "capAlphaSeq": 1,
          "beginCharCount": {"S": 1, "[": 1, "1": 1},
          "endCharCount": {"2": 1, ".": 1, "]": 1},
          "tokenPosCount": {
            ".": {
              ".": 2
            },
            "Songes": {
              "NNP": 1
            },
            "ald": {
              "NN": 1
            },
            "-LSB-": {
              "-LRB-": 1
            },
            "122": {
              "CD": 1
            },
            "-RSB-": {
              "-RRB-": 1
            },
            "Grim": {
              "JJ": 1
            }
          }
        },
        "body": {
          "tokenCount": 380,
          "lineCount": 40,
          "emptyLineCount": 0,
          "sentenceCount": 9,
          "capAlphaSeq": 4,
          "beginCharCount": {
            "T": 8,
            "F": 2,
            "A": 6,
            "M": 1,
            "1": 2,
            "L": 1,
            "B": 5,
            "P": 1,
            "0": 1,
            "H": 2,
            "W": 6,
            "O": 2,
            "D": 1,
            "S": 2
          },
          "endCharCount": {
            "e": 2,
            "s": 1,
            "n": 2,
            ".": 5,
            "t": 1,
            "f": 1,
            "m": 1,
            ",": 13,
            "g": 1,
            "?": 2,
            "h": 1,
            ":": 10
          },
          "tokenPosCount": {
            ...
            "Macedonian": {
              "JJ": 1
            },
            "My": {
              "PRP$": 1
            },
            "Persuades": {
              "VBZ": 1
            },
            "woords": {
              "NNS": 1
            },
            "greaue": {
              "NN": 1
            },
            "ruthe": {
              "NN": 1
            },
            "do": {
              "VBP": 1
            },
            "all": {
              "DT": 3,
              "PDT": 1
            },
            "him": {
              "PRP": 4
            },
            "Meleager": {
              "NNP": 1
            },
            ...
          }
        },
        "footer": null,
        "calculatedLanguage": "en"
      },
      ...
    ]
  }
}