Mercurial > hg > Members > anatofuz > lectable
changeset 12:e00b5301263a
def convert pdf function
author | anatofuz <anatofuz@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 01 Apr 2020 09:31:19 +0900 |
parents | e7d9f63d969c |
children | 989cfda07d71 |
files | cmd_pdf.go go.mod go.sum pdf.go |
diffstat | 4 files changed, 60 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/cmd_pdf.go Tue Mar 31 19:15:18 2020 +0900 +++ b/cmd_pdf.go Wed Apr 01 09:31:19 2020 +0900 @@ -2,6 +2,7 @@ import ( "context" + "fmt" "io" ) @@ -16,5 +17,7 @@ } func (cd *cmdPDF) run(ctx context.Context, argv []string, outStream, errStream io.Writer) error { + h, _ := convertStringFromPDF(&argv) + fmt.Println(h) return nil }
--- a/go.mod Tue Mar 31 19:15:18 2020 +0900 +++ b/go.mod Wed Apr 01 09:31:19 2020 +0900 @@ -3,6 +3,8 @@ go 1.14 require ( + github.com/ledongthuc/pdf v0.0.0-20200323191019-23c5852adbd2 github.com/pkg/errors v0.9.1 + golang.org/x/text v0.3.2 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 )
--- a/go.sum Tue Mar 31 19:15:18 2020 +0900 +++ b/go.sum Wed Apr 01 09:31:19 2020 +0900 @@ -1,4 +1,11 @@ +github.com/dcu/pdf v0.0.0-20190612170416-c8b299d05f89 h1:h8vPbGmLBQY15p63w4I9dGQWc6YtR3p+uG1j/T+GrF4= +github.com/dcu/pdf v0.0.0-20190612170416-c8b299d05f89/go.mod h1:Y73szhmilZ/gaudo99AA26HG9ldrKYcVIyKBne65nMQ= +github.com/ledongthuc/pdf v0.0.0-20200323191019-23c5852adbd2 h1:H9HhyvygtvWnn1R8ymra4vdIUOvDDlaPlX6mjoJ9UTY= +github.com/ledongthuc/pdf v0.0.0-20200323191019-23c5852adbd2/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pdf.go Wed Apr 01 09:31:19 2020 +0900 @@ -0,0 +1,48 @@ +package lectable + +import ( + "bytes" + "regexp" + "strings" + + "github.com/ledongthuc/pdf" + "github.com/pkg/errors" + "golang.org/x/text/unicode/norm" +) + +type lectureFPDF struct { + id string `json:id` + body string `json:body` + isSelect bool `json:isSelect` + day string `json:day` + place string `json:place` +} + +var idRgex = regexp.MustCompile(`\d{9}`) + +type pdfPagesStr []string + +func convertStringFromPDF(pdfPaths *[]string) (*[]pdfPagesStr, error) { + var ppsr []pdfPagesStr + for _, apdf := range *pdfPaths { + file, reader, err := pdf.Open(apdf) + defer file.Close() + if err != nil { + return nil, errors.Wrap(err, "failed open pdf") + } + + var buf bytes.Buffer + preader, err := reader.GetPlainText() + if err != nil { + return nil, errors.Wrap(err, "failed convert pdf") + } + + buf.ReadFrom(preader) + planString := string(norm.NFKC.Bytes(buf.Bytes())) + + planstrings := strings.Split(planString, " ") // " " is next pdf page + planstrings = planstrings[1:] // 最初はゴミ + ppsr = append(ppsr, planstrings) + } + return &ppsr, nil +}